######################################################################################################### #Fathi_et_al_2022_script2.R #Paper: "Dietary vitamin B6 intake influences the microbial composition and functional potential of the gut microbiome in Parkinson’s disease" #Authors: Ayda Fathi, Helena Sokolovska, Yixuan Zhang, and Yoyo Lee #Date: January 31, 2022 #Purpose: R analysis - nutrient stratification, differential/relative abundance analysis ######################################################################################################### ######################################## Nutrient Stratification ######################################## #load packages library(ggplot2) library(dplyr) library(readxl) library(xlsx) #select a specific set of random numbers and makes the analysis reproducible set.seed(711) #import metadata Excel file parkinsons_metadata <- read_excel("~/4th Year Uni/TERM 2/MICB 447/Datasets/parkinsons_metadata.xlsx") View(parkinsons_metadata) #create data frame and clean up data: #create data frame of metadata PD_data_frame <- data.frame(parkinsons_metadata) #change first column name to remove hashtag (interferes with coding) PD_data_frame <- parkinsons_metadata %>% rename("SampleID" = "#SampleID") #add new column to data frame that sums up vitamin A + equivalents PD_data_frame$Total_Vitamin_A <- (PD_data_frame$Vitamin_A + PD_data_frame$Vitamin_A_equiv) #remove N/A values in nutrition data prior to summary stats na.rm_PD_data_frame <- PD_data_frame %>% filter(!is.na(PUFA), !is.na(SFA), !is.na(Total_Vitamin_A), !is.na(Vitamin_B1), !is.na(Vitamin_B2), !is.na(Niacin), !is.na(Vitamin_B6), !is.na(Vitamin_B12), !is.na(Vitamin_C), !is.na(Vitamin_D), !is.na(Vitamin_E)) #determining quartiles for stratification: #split metadata into 2 data frames: control-only and PD-only na.rm_control_only_data_frame <- filter(na.rm_PD_data_frame, Disease == "Control") na.rm_PD_only_data_frame <- filter(na.rm_PD_data_frame, Disease == "PD") #summary stats for each vitamin intake distribution, using control-only data frame #(to determine quartiles for stratification) summary(na.rm_control_only_data_frame$PUFA) #Min. 1st Qu. Median Mean 3rd Qu. Max. #3.90 10.40 12.90 14.22 17.46 36.20 summary(na.rm_control_only_data_frame$SFA) #Min. 1st Qu. Median Mean 3rd Qu. Max. #6.594 17.936 22.600 25.061 29.900 61.900 summary(na.rm_control_only_data_frame$Total_Vitamin_A) #Min. 1st Qu. Median Mean 3rd Qu. Max. #374.3 1036.3 1380.3 1622.9 1710.5 12601.6 summary(na.rm_control_only_data_frame$Vitamin_B1) #Min. 1st Qu. Median Mean 3rd Qu. Max. #0.4000 0.9726 1.2000 1.3078 1.4262 6.8563 summary(na.rm_control_only_data_frame$Vitamin_B2) #Min. 1st Qu. Median Mean 3rd Qu. Max. #0.600 1.200 1.600 1.634 1.990 4.500 summary(na.rm_control_only_data_frame$Niacin) #Min. 1st Qu. Median Mean 3rd Qu. Max. #8.959 15.183 19.901 20.457 23.400 50.400 summary(na.rm_control_only_data_frame$Vitamin_B6) #Min. 1st Qu. Median Mean 3rd Qu. Max. #0.9535 1.5805 1.9193 2.0082 2.3000 4.8000 summary(na.rm_control_only_data_frame$Vitamin_B12) #Min. 1st Qu. Median Mean 3rd Qu. Max. #0.02695 2.98709 4.80000 5.44874 7.00000 27.50000 summary(na.rm_control_only_data_frame$Vitamin_C) #Min. 1st Qu. Median Mean 3rd Qu. Max. #18.55 80.50 112.80 127.71 157.99 334.02 summary(na.rm_control_only_data_frame$Vitamin_D) #Min. 1st Qu. Median Mean 3rd Qu. Max. #0.000 1.300 2.025 2.387 2.824 8.815 summary(na.rm_control_only_data_frame$Vitamin_E) #Min. 1st Qu. Median Mean 3rd Qu. Max. #3.771 8.800 11.100 12.172 14.878 31.700 #make boxplots of nutrient intake distributions, confirm intake Q1/Q3 for control match summary stats: na.rm_PD_data_frame %>% ggplot(aes(x = Disease, y = PUFA)) + geom_boxplot() + labs(title = "PUFA Intake", x = "Disease status", y = "Intake (g)") + theme_bw() na.rm_PD_data_frame %>% ggplot(aes(x = Disease, y = SFA)) + geom_boxplot() + labs(title = "SFA Intake", x = "Disease status", y = "Intake (g)") + theme_bw() na.rm_PD_data_frame %>% ggplot(aes(x = Disease, y = Total_Vitamin_A)) + geom_boxplot() + labs(title = "(Vitamin A + Equivalents) Intake", x = "Disease status", y = "Intake (mcg)") + theme_bw() na.rm_PD_data_frame %>% ggplot(aes(x = Disease, y = Vitamin_B1)) + geom_boxplot() + labs(title = "Vitamin B1 Intake", x = "Disease status", y = "Intake (mg)") + theme_bw() na.rm_PD_data_frame %>% ggplot(aes(x = Disease, y = Vitamin_B2)) + geom_boxplot() + labs(title = "Vitamin B2 Intake", x = "Disease status", y = "Intake (mg)") + theme_bw() na.rm_PD_data_frame %>% ggplot(aes(x = Disease, y = Niacin)) + geom_boxplot() + labs(title = "Vitamin B3 Intake", x = "Disease status", y = "Intake (mg)") + theme_bw() na.rm_PD_data_frame %>% ggplot(aes(x = Disease, y = Vitamin_B6)) + geom_boxplot() + labs(title = "Vitamin B6 Intake", x = "Disease status", y = "Intake (mg)") + theme_bw() na.rm_PD_data_frame %>% ggplot(aes(x = Disease, y = Vitamin_B12)) + geom_boxplot() + labs(title = "Vitamin B12 Intake", x = "Disease status", y = "Intake (mcg)") + theme_bw() na.rm_PD_data_frame %>% ggplot(aes(x = Disease, y = Vitamin_C)) + geom_boxplot() + labs(title = "Vitamin C Intake", x = "Disease status", y = "Intake (mg)") + theme_bw() na.rm_PD_data_frame %>% ggplot(aes(x = Disease, y = Vitamin_D)) + geom_boxplot() + labs(title = "Vitamin D Intake", x = "Disease status", y = "Intake (mcg)") + theme_bw() na.rm_PD_data_frame %>% ggplot(aes(x = Disease, y = Vitamin_E)) + geom_boxplot() + labs(title = "Vitamin E Intake", x = "Disease status", y = "Intake (mg)") + theme_bw() #stratify nutrient intake data based on intake Q1/Q3 for control: stratified_PD_data_frame <- mutate(PD_data_frame, PUFA_intake_stratified = cut(PUFA, breaks = c(0, 10.40, 17.46, Inf), labels = c("low", "moderate", "high"))) stratified_PD_data_frame <- mutate(stratified_PD_data_frame, SFA_intake_stratified = cut(SFA, breaks = c(0, 17.936, 29.900, Inf), labels = c("low", "moderate", "high"))) stratified_PD_data_frame <- mutate(stratified_PD_data_frame, Total_Vitamin_A_intake_stratified = cut(Total_Vitamin_A, breaks = c(0, 1036.3, 1710.5, Inf), labels = c("low", "moderate", "high"))) stratified_PD_data_frame <- mutate(stratified_PD_data_frame, Vitamin_B1_intake_stratified = cut(Vitamin_B1, breaks = c(0, 0.9726, 1.4262, Inf), labels = c("low", "moderate", "high"))) stratified_PD_data_frame <- mutate(stratified_PD_data_frame, Vitamin_B2_intake_stratified = cut(Vitamin_B2, breaks = c(0, 1.200, 1.990, Inf), labels = c("low", "moderate", "high"))) stratified_PD_data_frame <- mutate(stratified_PD_data_frame, Vitamin_B3_intake_stratified = cut(Niacin, breaks = c(0, 15.183, 23.400, Inf), labels = c("low", "moderate", "high"))) stratified_PD_data_frame <- mutate(stratified_PD_data_frame, Vitamin_B6_intake_stratified = cut(Vitamin_B6, breaks = c(0, 1.5805, 2.3000, Inf), labels = c("low", "moderate", "high"))) stratified_PD_data_frame <- mutate(stratified_PD_data_frame, Vitamin_B12_intake_stratified = cut(Vitamin_B12, breaks = c(0, 2.98709, 7.00000, Inf), labels = c("low", "moderate", "high"))) stratified_PD_data_frame <- mutate(stratified_PD_data_frame, Vitamin_C_intake_stratified = cut(Vitamin_C, breaks = c(0, 80.50, 157.99, Inf), labels = c("low", "moderate", "high"))) stratified_PD_data_frame <- mutate(stratified_PD_data_frame, Vitamin_D_intake_stratified = cut(Vitamin_D, breaks = c(-Inf, 1.300, 2.824, Inf), labels = c("low", "moderate", "high"))) stratified_PD_data_frame <- mutate(stratified_PD_data_frame, Vitamin_E_intake_stratified = cut(Vitamin_E, breaks = c(0, 8.800, 14.878, Inf), labels = c("low", "moderate", "high"))) #final output = stratified_PD_data_frame (111 columns, 300 rows) #startifying based on disease and dose intake #creating a new variable combining the disease to nutrient intake: parkinsons_metadata_Stratified <- read_excel("+B3_stratified_na.rm_PD_metadata.xlsx") View(parkinsons_metadata_Stratified) #create data frame of metadata PD_data_frame <- data.frame(parkinsons_metadata_Stratified) #add new column to data frame that sums up nutrient intake and disease status PD_data_frame$Total_Vitamin_A_and_Disease <- paste(PD_data_frame$Total_Vitamin_A_intake_stratified, "and", PD_data_frame$Disease) PD_data_frame$Total_Vitamin_B1_and_Disease <- paste(PD_data_frame$Vitamin_B1_intake_stratified, "and", PD_data_frame$Disease) PD_data_frame$Total_Vitamin_B2_and_Disease <- paste(PD_data_frame$Vitamin_B2_intake_stratified, "and", PD_data_frame$Disease) PD_data_frame$Total_Vitamin_B3_and_Disease <- paste(PD_data_frame$Vitamin_B3_intake_stratified, "and", PD_data_frame$Disease) PD_data_frame$Total_Vitamin_B6_and_Disease <- paste(PD_data_frame$Vitamin_B6_intake_stratified, "and", PD_data_frame$Disease) PD_data_frame$Total_Vitamin_B12_and_Disease <- paste(PD_data_frame$Vitamin_B12_intake_stratified, "and", PD_data_frame$Disease) PD_data_frame$Total_Vitamin_C_and_Disease <- paste(PD_data_frame$Vitamin_C_intake_stratified, "and", PD_data_frame$Disease) PD_data_frame$Total_Vitamin_D_and_Disease <- paste(PD_data_frame$Vitamin_D_intake_stratified, "and", PD_data_frame$Disease) PD_data_frame$Total_Vitamin_E_and_Disease <- paste(PD_data_frame$Vitamin_E_intake_stratified, "and", PD_data_frame$Disease) PD_data_frame$SFA_intake_stratified_and_Disease<- paste(PD_data_frame$SFA_intake_stratified, "and", PD_data_frame$Disease) PD_data_frame$PUFA_intake_stratified_and_Disease<- paste(PD_data_frame$PUFA_intake_stratified, "and", PD_data_frame$Disease) #create box plots of nutrients combined with disease PD_data_frame %>% ggplot(aes(x = PUFA_intake_stratified_and_Disease, y = PUFA)) + geom_boxplot() + labs(title = "PUFA Intake and Disease", x = "Intake dose and disease status", y = "PUFA (g)") + theme_bw() PD_data_frame %>% ggplot(aes(x = SFA_intake_stratified_and_Disease, y = SFA)) + geom_boxplot() + labs(title = "SFA Intake and Disease", x = "Intake dose and disease status", y = "SFA (g)") + theme_bw() PD_data_frame %>% ggplot(aes(x = Total_Vitamin_A_and_Disease, y = Total_Vitamin_A)) + geom_boxplot() + labs(title = "(Vitamin A + Equivalents) Intake and Disease", x = "Intake dose and disease status", y = "Intake (mcg)") + theme_bw() PD_data_frame %>% ggplot(aes(x = Total_Vitamin_B1_and_Disease, y = Vitamin_B1)) + geom_boxplot() + labs(title = "Vitamin B1 Intake and Disease", x = "Intake dose and disease status", y = "Intake (mg)") + theme_bw() PD_data_frame %>% ggplot(aes(x = Total_Vitamin_B2_and_Disease, y = Vitamin_B2)) + geom_boxplot() + labs(title = "Vitamin B2 Intake and Disease", x = "Intake dose and disease status", y = "Intake (mg)") + theme_bw() PD_data_frame %>% ggplot(aes(x = Total_Vitamin_B3_and_Disease, y = Niacin)) + geom_boxplot() + labs(title = "Vitamin B3 Intake and Disease", x = "Intake dose and disease status", y = "Intake (mg)") + theme_bw() PD_data_frame %>% ggplot(aes(x = Total_Vitamin_B6_and_Disease, y = Vitamin_B6)) + geom_boxplot() + labs(title = "Vitamin B6 Intake and Disease", x = "Intake dose and disease status", y = "Intake (mg)") + theme_bw() PD_data_frame %>% ggplot(aes(x = Total_Vitamin_B12_and_Disease, y = Vitamin_B12)) + geom_boxplot() + labs(title = "Vitamin B12 Intake and Disease", x = "Intake dose and disease status", y = "Intake (mcg)") + theme_bw() PD_data_frame %>% ggplot(aes(x = Total_Vitamin_C_and_Disease, y = Vitamin_C)) + geom_boxplot() + labs(title = "Vitamin C Intake and Disease", x = "Intake dose and disease status", y = "Intake (mg)") + theme_bw() PD_data_frame %>% ggplot(aes(x = Total_Vitamin_D_and_Disease, y = Vitamin_D)) + geom_boxplot() + labs(title = "Vitamin D Intake and Disease", x = "Intake dose and disease status", y = "Intake (mcg)") + theme_bw() PD_data_frame %>% ggplot(aes(x = Total_Vitamin_E_and_Disease, y = Vitamin_E)) + geom_boxplot() + labs(title = "Vitamin E Intake and Disease", x = "Intake dose and disease status", y = "Intake (mg)") + theme_bw() #export data frame as xlsx write.xlsx(PD_data_frame, "/Users/Ayda/Desktop/stratified_PD_Disease_Combined_metadata.xlsx") ######################################## Differential Abundance Analysis: PD vs. Non-PD ######################################## #loading CRAN packages library(tidyverse) library(vegan) library(ape) #load Bioconductor packages library(phyloseq) library(DESeq2) #provide custom function calculate_relative_abundance <- function(x) x / sum(x) calculate_gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x)) } #setting random numbers set.seed(711) #importing Qiime2 data into R biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("colrm_stratified_PD_Disease_Combined_metadata.tsv") tree <- read_tree_greengenes ("tree.nwk") #convert tree from multichotomous to dichotomous tree <- multi2di(tree) # Combine all information into a single phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) #assign new taxonomic column name colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class","Order", "Family", "Genus", "Species") head(tax_table(physeq)) #Setting sampling depth sample_sums(physeq) >= 7000 #Filter samples based on sampling depth at_least_7000 <- prune_samples(sample_sums(physeq) >= 7000, physeq) sample_sums(at_least_7000) #Getting most abundant taxa at the genus level #remove low abundance features total_counts <- taxa_sums(at_least_7000) relative_abundance <- calculate_relative_abundance(total_counts) abundant <- relative_abundance > 0.001 abundant_taxa <- prune_taxa(abundant, at_least_7000) abundant_taxa #set taxonomic level to genus abundant_genera <- tax_glom(abundant_taxa, taxrank = "Genus") abundant_genera #DESeq2 analysis genus deseq_genera <- phyloseq_to_deseq2(abundant_genera, ~ Disease) geo_means_genera<- apply(counts(deseq_genera), 1, calculate_gm_mean) deseq_genera <- estimateSizeFactors(deseq_genera, geoMeans = geo_means_genera) deseq_genera <- DESeq(deseq_genera, fitType = "local") diff_abund_genera<- results(deseq_genera) #filter for data with p-value below alpha (0.05) alpha <- 0.05 significant_genera <- as.data.frame(diff_abund_genera) significant_genera <- filter(significant_genera, padj < alpha) #merge tables with significant results with table of taxonomic information genera_df <- as.data.frame(tax_table(abundant_genera)) significant_genera <- merge(significant_genera, genera_df, by = "row.names") significant_genera <- arrange(significant_genera, log2FoldChange) dim(significant_genera) significant_genera #create differential abundance genera plot significant_genera <- filter(significant_genera, Genus != "g__") significant_genera <- mutate(significant_genera, Genus = factor(Genus, levels = Genus)) ggplot(significant_genera, aes(x = log2FoldChange, y = Genus)) + geom_bar(stat = "identity") + labs(title = "Differential abundant genera control vs PD", x = expression(log[2]~fold~change), y = "Genus") + theme_bw() #set taxonomic level to family abundant_family <- tax_glom(abundant_taxa, taxrank = "Family") abundant_family #DESeq2 analysis family deseq_family <- phyloseq_to_deseq2(abundant_family, ~ Disease) geo_means_family <- apply(counts(deseq_family), 1, calculate_gm_mean) deseq_family <- estimateSizeFactors(deseq_family, geoMeans = geo_means_family) deseq_family <- DESeq(deseq_family, fitType = "local") diff_abund_family <- results(deseq_family) #filter for data with p-value below alpha (0.05) alpha <- 0.05 significant_family <- as.data.frame(diff_abund_family) significant_family <- filter(significant_family, padj < alpha) #merge tables with significant results with table of taxonomic information family_df <- as.data.frame(tax_table(abundant_family)) significant_family <- merge(significant_family, family_df, by = "row.names") significant_family <- arrange(significant_family, log2FoldChange) dim(significant_family) significant_family #create differential abundance family plot significant_family <- mutate(significant_family, Family = factor(Family, levels = unique(Family))) ggplot(significant_family, aes(x = log2FoldChange, y = Family)) + geom_bar(stat = "identity") + labs(title = "Differential abundant family control vs PD", x = expression(log[2]~fold~change), y = "Family") + theme_bw() #set taxonomic level to species abundant_species <- tax_glom(abundant_taxa, taxrank = "Species") abundant_species #DESeq2 analysis species deseq_genus_species <- phyloseq_to_deseq2(abundant_species, ~ Disease) geo_means_species <- apply(counts(deseq), 1, calculate_gm_mean) deseq_species <- estimateSizeFactors(deseq, geoMeans = geo_means_species) deseq_species <- DESeq(deseq_species, fitType = "local") diff_abund_species <- results(deseq_species) #filter for data with p-value below alpha (0.05) alpha <- 0.05 significant_species <- as.data.frame(diff_abund_species) significant_species <- filter(significant_species, padj < alpha) #merge tables with significant results with table of taxonomic information species_df <- as.data.frame(tax_table(abundant_species)) significant_species <- merge(significant_species, species_df, by = "row.names") significant_species <- arrange(significant_species, log2FoldChange) dim(significant_species) significant_species ######################################## Relative Abundance Analysis: PD vs. Non-PD ######################################## #calculate relative abundance at_least_7000_RA <- transform_sample_counts(at_least_7000, calculate_relative_abundance) #remove low abundant data total_counts <- taxa_sums(at_least_7000) relative_abundance <- calculate_relative_abundance(total_counts) abundant <- relative_abundance > 0.001 abundant_RA_taxa <- prune_taxa(abundant, at_least_7000_RA) #set taxonomic rank to genus abundant_RA_genera <- tax_glom(abundant_RA_taxa, taxrank = "Genus") bifidobacterium <- subset_taxa(abundant_RA_genera, Genus == "g__Bifidobacterium") otu_table(bifidobacterium) bifidobacterium_long <- psmelt(bifidobacterium) bifidobacterium_long #plot with ggplot ggplot(bifidobacterium_long, aes(x = Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance of Bifidobacterium", x = "Disease", y = "Relative abundance") #set taxonomic rank to genus abundant_RA_genera <- tax_glom(abundant_RA_taxa, taxrank = "Genus") akkermansia <- subset_taxa(abundant_RA_genera, Genus == "g__Akkermansia") otu_table(akkermansia) akkermansia_long <- psmelt(akkermansia) akkermansia_long #plot with ggplot ggplot(akkermansia_long, aes(x = Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance of Akkermansia", x = "Disease", y = "Relative abundance") #set taxonomic rank to genus abundant_RA_genera <- tax_glom(abundant_RA_taxa, taxrank = "Genus") collinsella <- subset_taxa(abundant_RA_genera, Genus == "g__Collinsella") otu_table(collinsella) collinsella_long <- psmelt(collinsella) collinsella_long #plot with ggplot ggplot(collinsella_long, aes(x = Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance of Collinsella", x = "Disease", y = "Relative abundance") #set taxonomic rank to genus abundant_RA_genera <- tax_glom(abundant_RA_taxa, taxrank = "Genus") faecalibacterium <- subset_taxa(abundant_RA_genera, Genus == "g__Faecalibacterium") otu_table(faecalibacterium) faecalibacterium_long <- psmelt(faecalibacterium) faecalibacterium_long #plot with ggplot ggplot(faecalibacterium_long, aes(x = Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance of Faecalibacterium", x = "Disease", y = "Relative abundance") #set taxonomic rank to genus abundant_RA_genera <- tax_glom(abundant_RA_taxa, taxrank = "Genus") roseburia <- subset_taxa(abundant_RA_genera, Genus == "g__Roseburia") otu_table(roseburia) roseburia_long <- psmelt(roseburia) roseburia_long #plot with ggplot ggplot(roseburia_long, aes(x = Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance of Roseburia", x = "Disease", y = "Relative abundance") #set taxonomic rank to genus abundant_RA_genera <- tax_glom(abundant_RA_taxa, taxrank = "Genus") Oscillibacter <- subset_taxa(abundant_RA_genera, Genus == "g__Oscillibacter") otu_table(Oscillibacter) Oscillibacter_long <- psmelt(Oscillibacter) Oscillibacter_long #plot with ggplot ggplot(Oscillibacter_long, aes(x = Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance of Oscillibacter", x = "Disease", y = "Relative abundance") #set taxonomic level to genus abundant_RA_genera <- tax_glom(abundant_taxa, taxrank = "Genus") Eubacterium_coprostanoligenes_group <- subset_taxa(abundant_RA_genera, Genus == "g__[Eubacterium]_coprostanoligenes_group") otu_table(Eubacterium_coprostanoligenes_group) Eubacterium_coprostanoligenes_group_long <- psmelt(Eubacterium_coprostanoligenes_group) Eubacterium_coprostanoligenes_group_long #plot with ggplot ggplot(Eubacterium_coprostanoligenes_group_long, aes(x = Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance of Eubacterium coprostanoligenes group", x = "Disease", y = "Relative abundance") ######################################## Differential Abundance Analysis: Non-PD, Low vs. High Vitamin B1 ######################################## #loading CRAN packages library(tidyverse) library(vegan) library(ape) #load Bioconductor packages library(phyloseq) library(DESeq2) #provide custom function calculate_relative_abundance <- function(x) x / sum(x) calculate_gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x)) } #setting random numbers set.seed(711) #importing Qiime2 data into R biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("colrm_stratified_PD_Disease_Combined_metadata.tsv") tree <- read_tree_greengenes("tree.nwk") #convert tree from multichotomous to dichotomous tree <- multi2di(tree) # Combine all information into a single phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) #assign new taxonomic column name colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class","Order", "Family", "Genus", "Species") head(tax_table(physeq)) #Setting sampling depth sample_sums(physeq) >= 7000 #Filter samples based on sampling depth at_least_7000 <- prune_samples(sample_sums(physeq) >= 7000, physeq) sample_sums(at_least_7000) #filter based on metadata vitb1 <- subset_samples(at_least_7000, Total_Vitamin_B1_and_Disease %in% c("low and Control", "high and Control")) #remove low abundance features total_counts <- taxa_sums(vitb1) relative_abundance <- calculate_relative_abundance(total_counts) abundant <- relative_abundance > 0.001 abundant_taxa <- prune_taxa(abundant, vitb1) abundant_taxa #set taxonomic level to genus abundant_genera <- tax_glom(abundant_taxa, taxrank = "Genus") abundant_genera #convert variable to 2 categories sample_data(abundant_genera)$Total_Vitamin_B1_and_Disease <- factor(sample_data(abundant_genera)$Total_Vitamin_B1_and_Disease, levels = c("low and Control", "high and Control")) #DESeq2 analysis genus deseq <- phyloseq_to_deseq2(abundant_genera, ~ Total_Vitamin_B1_and_Disease) geo_means <- apply(counts(deseq), 1, calculate_gm_mean) deseq <- estimateSizeFactors(deseq, geoMeans = geo_means) deseq <- DESeq(deseq, fitType = "local") diff_abund <- results(deseq) #filter for data with p-value below alpha (0.05) alpha <- 0.05 significant <- as.data.frame(diff_abund) significant <- filter(significant, padj < alpha) #merge tables with significant results with table of taxonomic information genera_df <- as.data.frame(tax_table(abundant_genera)) significant <- merge(significant, genera_df, by = "row.names") significant <- arrange(significant, log2FoldChange) dim(significant) significant #create differential abundance genera plot significant <- filter(significant, Genus != "g__") significant <- mutate(significant, Genus = factor(Genus, levels = Genus)) ggplot(significant, aes(x = log2FoldChange, y = Genus)) + geom_bar(stat = "identity") + labs(title = "Differential abundant genera non-PD low Vitamin B1 vs non-PD high Vitamin B1", x = expression(log[2]~fold~change), y = "Genus") + theme_bw() ######################################## Relative Abundance Analysis: Non-PD, Low vs. High Vitamin B1 ######################################## #relative abundance (non PD Vitamin B1) #calculate relative abundance vitb1_RA <- transform_sample_counts(vitb1, calculate_relative_abundance) #remove low abundance features total_counts <- taxa_sums(vitb1) relative_abundance <- calculate_relative_abundance(total_counts) abundant <- relative_abundance > 0.001 abundant_taxa <- prune_taxa(abundant, vitb1_RA) #set taxonomic level to genus abundant_RA_genera <- tax_glom(abundant_taxa, taxrank = "Genus") Muribaculaceae <- subset_taxa(abundant_RA_genera, Genus == "g__Muribaculaceae") otu_table(Muribaculaceae) Muribaculaceae_long <- psmelt(Muribaculaceae) Muribaculaceae_long Prevotellaceae_NK3B31_group <- subset_taxa(abundant_RA_genera, Genus == "g__Prevotellaceae_NK3B31_group") otu_table(Prevotellaceae_NK3B31_group) Prevotellaceae_NK3B31_group_long <- psmelt(Prevotellaceae_NK3B31_group) Prevotellaceae_NK3B31_group_long Bacteroides <- subset_taxa(abundant_RA_genera, Genus == "g__Bacteroides") otu_table(Bacteroides) Bacteroides_long <- psmelt(Bacteroides) Bacteroides_long Lachnoclostridium <- subset_taxa(abundant_RA_genera, Genus == "g__Lachnoclostridium") otu_table(Lachnoclostridium) Lachnoclostridium_long <- psmelt(Lachnoclostridium) Lachnoclostridium_long Gastranaerophilales <- subset_taxa(abundant_RA_genera, Genus == "g__Gastranaerophilales") otu_table(Gastranaerophilales) Gastranaerophilales_long <- psmelt(Gastranaerophilales) Gastranaerophilales_long #relative abundance plots ggplot(Muribaculaceae_long, aes(x = Total_Vitamin_B1_and_Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance of Muribaculaceae", x = "Vitamin B1 intake and disease", y = "Relative abundance") ggplot(Prevotellaceae_NK3B31_group_long, aes(x = Total_Vitamin_B1_and_Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance of Prevotellaceae_NK3B31_group", x = "Vitamin B1 intake and disease", y = "Relative abundance") ggplot(Bacteroides_long, aes(x = Total_Vitamin_B1_and_Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance of Bacteroides", x = "Vitamin B1 intake and disease", y = "Relative abundance") ggplot(Lachnoclostridium_long, aes(x = Total_Vitamin_B1_and_Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance of Lachnoclostridium", x = "Vitamin B1 intake and disease", y = "Relative abundance") ggplot(Gastranaerophilales_long, aes(x = Total_Vitamin_B1_and_Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance of Gastranaerophilales", x = "Vitamin B1 intake and disease", y = "Relative abundance") ######################################## Differential Abundance Analysis: PD, Low vs. High Vitamin B1 ######################################## #loading CRAN packages library(tidyverse) library(vegan) library(ape) #load Bioconductor packages library(phyloseq) library(DESeq2) #provide custom function calculate_relative_abundance <- function(x) x / sum(x) calculate_gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x)) } #setting random numbers set.seed(711) #importing Qiime2 data into R biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("colrm_stratified_PD_Disease_Combined_metadata.tsv") tree <- read_tree_greengenes("tree.nwk") #convert tree from multichotomous to dichotomous tree <- multi2di(tree) # Combine all information into a single phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) #assign new taxonomic column name colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class","Order", "Family", "Genus", "Species") head(tax_table(physeq)) #Setting sampling depth sample_sums(physeq) >= 7000 #Filter samples based on sampling depth at_least_7000 <- prune_samples(sample_sums(physeq) >= 7000, physeq) sample_sums(at_least_7000) #filter based on metadata vitb1 <- subset_samples(at_least_7000, Total_Vitamin_B1_and_Disease %in% c("low and PD", "high and PD")) #remove low abundance features total_counts <- taxa_sums(vitb1) relative_abundance <- calculate_relative_abundance(total_counts) abundant <- relative_abundance > 0.001 abundant_taxa <- prune_taxa(abundant, vitb1) abundant_taxa #set taxonomic level to genus abundant_genera <- tax_glom(abundant_taxa, taxrank = "Genus") abundant_genera #convert variable to 2 categories sample_data(abundant_genera)$Total_Vitamin_B1_and_Disease <- factor(sample_data(abundant_genera)$Total_Vitamin_B1_and_Disease, levels = c("low and PD", "high and PD")) #DESeq2 analysis genus deseq <- phyloseq_to_deseq2(abundant_genera, ~ Total_Vitamin_B1_and_Disease) geo_means <- apply(counts(deseq), 1, calculate_gm_mean) deseq <- estimateSizeFactors(deseq, geoMeans = geo_means) deseq <- DESeq(deseq, fitType = "local") diff_abund <- results(deseq) #filter for data with p-value below alpha (0.05) alpha <- 0.05 significant <- as.data.frame(diff_abund) significant <- filter(significant, padj < alpha) #merge tables with significant results with table of taxonomic information genera_df <- as.data.frame(tax_table(abundant_genera)) significant <- merge(significant, genera_df, by = "row.names") significant <- arrange(significant, log2FoldChange) dim(significant) significant #create differential abundance genera plot significant <- filter(significant, Genus != "g__") significant <- mutate(significant, Genus = factor(Genus, levels = Genus)) ggplot(significant, aes(x = log2FoldChange, y = Genus)) + geom_bar(stat = "identity") + labs(title = "Differential abundant genera PD low Vitamin B1 vs PD high Vitamin B1", x = expression(log[2]~fold~change), y = "Genus") + theme_bw() ######################################## Differential Abundance Analysis: non-PD, Low vs. High Vitamin B6 ######################################### #loading CRAN packages library(tidyverse) library(vegan) library(ape) #load Bioconductor packages library(phyloseq) library(DESeq2) #provide custom function calculate_relative_abundance <- function(x) x / sum(x) calculate_gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x)) } #setting random numbers set.seed(711) #importing Qiime2 data into R biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("colrm_stratified_PD_Disease_Combined_metadata.tsv") tree <- read_tree_greengenes("tree.nwk") #convert tree from multichotomous to dichotomous tree <- multi2di(tree) # Combine all information into a single phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) #assign new taxonomic column name colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class","Order", "Family", "Genus", "Species") head(tax_table(physeq)) #Setting sampling depth sample_sums(physeq) >= 7000 #Filter samples based on sampling depth at_least_7000 <- prune_samples(sample_sums(physeq) >= 7000, physeq) sample_sums(at_least_7000) #filter based on metadata vitb6 <- subset_samples(at_least_7000, Total_Vitamin_B6_and_Disease %in% c("low and Control", "high and Control")) #remove low abundance features total_counts <- taxa_sums(vitb6) relative_abundance <- calculate_relative_abundance(total_counts) abundant <- relative_abundance > 0.001 abundant_taxa <- prune_taxa(abundant, vitb6) abundant_taxa #set taxonomic level to genus abundant_genera <- tax_glom(abundant_taxa, taxrank = "Genus") abundant_genera #convert variable to 2 categories sample_data(abundant_genera)$Total_Vitamin_B6_and_Disease <- factor(sample_data(abundant_genera)$Total_Vitamin_B6_and_Disease, levels = c("low and Control", "high and Control")) #DESeq2 analysis genus deseq <- phyloseq_to_deseq2(abundant_genera, ~ Total_Vitamin_B6_and_Disease) geo_means <- apply(counts(deseq), 1, calculate_gm_mean) deseq <- estimateSizeFactors(deseq, geoMeans = geo_means) deseq <- DESeq(deseq, fitType = "local") diff_abund <- results(deseq) #filter for data with p-value below alpha (0.05) alpha <- 0.05 significant <- as.data.frame(diff_abund) significant <- filter(significant, padj < alpha) #merge tables with significant results with table of taxonomic information genera_df <- as.data.frame(tax_table(abundant_genera)) significant <- merge(significant, genera_df, by = "row.names") significant <- arrange(significant, log2FoldChange) dim(significant) significant #create differential abundance genera plot significant <- filter(significant, Genus != "g__") significant <- mutate(significant, Genus = factor(Genus, levels = Genus)) ggplot(significant, aes(x = log2FoldChange, y = Genus)) + geom_bar(stat = "identity") + labs(title = "Differential abundant genera non-PD low Vitamin B6 vs non-PD high Vitamin B6", x = expression(log[2]~fold~change), y = "Genus") + theme_bw() ######################################## Relative Abundance Analysis: non-PD, Low vs. High Vitamin B6 ######################################### #relative abundance of Prevotellaceae_NK3B31_group (non PD Vitamin B6) #calculate relative abundance vitb6_RA <- transform_sample_counts(vitb6, calculate_relative_abundance) #remove low abundance features total_counts <- taxa_sums(vitb6) relative_abundance <- calculate_relative_abundance(total_counts) abundant <- relative_abundance > 0.001 abundant_taxa <- prune_taxa(abundant, vitb6_RA) #set taxonomic level to genus abundant_RA_genera <- tax_glom(abundant_taxa, taxrank = "Genus") Prevotellaceae_NK3B31_group <- subset_taxa(abundant_RA_genera, Genus == "g__Prevotellaceae_NK3B31_group") otu_table(Prevotellaceae_NK3B31_group) Prevotellaceae_NK3B31_group_long <- psmelt(Prevotellaceae_NK3B31_group) Prevotellaceae_NK3B31_group_long #plot with ggplot ggplot(Prevotellaceae_NK3B31_group_long, aes(x = Total_Vitamin_B6_and_Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance of Prevotellaceae_NK3B31_group", x = "Vitamin B6 intake and disease", y = "Relative abundance") ######################################## Differential Abundance Analysis: PD, Low vs. High vitamin B6 ######################################## #loading CRAN packages library(tidyverse) library(vegan) library(ape) #load Bioconductor packages library(phyloseq) library(DESeq2) #provide custom function calculate_relative_abundance <- function(x) x / sum(x) calculate_gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x)) } #setting random numbers set.seed(711) #importing Qiime2 data into R biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("colrm_stratified_PD_Disease_Combined_metadata.tsv") tree <- read_tree_greengenes("tree.nwk") #convert tree from multichotomous to dichotomous tree <- multi2di(tree) # Combine all information into a single phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) #assign new taxonomic column name colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class","Order", "Family", "Genus", "Species") head(tax_table(physeq)) #Setting sampling depth sample_sums(physeq) >= 7000 #Filter samples based on sampling depth at_least_7000 <- prune_samples(sample_sums(physeq) >= 7000, physeq) sample_sums(at_least_7000) #filter based on metadata vitb6 <- subset_samples(at_least_7000, Total_Vitamin_B6_and_Disease %in% c("low and PD", "high and PD")) #remove low abundance features total_counts <- taxa_sums(vitb6) relative_abundance <- calculate_relative_abundance(total_counts) abundant <- relative_abundance > 0.001 abundant_taxa <- prune_taxa(abundant, vitb6) abundant_taxa #set taxonomic level to genus abundant_genera <- tax_glom(abundant_taxa, taxrank = "Genus") abundant_genera #convert variable to 2 categories sample_data(abundant_genera)$Total_Vitamin_B6_and_Disease <- factor(sample_data(abundant_genera)$Total_Vitamin_B6_and_Disease, levels = c("low and PD", "high and PD")) #DESeq2 analysis genus deseq <- phyloseq_to_deseq2(abundant_genera, ~ Total_Vitamin_B6_and_Disease) geo_means <- apply(counts(deseq), 1, calculate_gm_mean) deseq <- estimateSizeFactors(deseq, geoMeans = geo_means) deseq <- DESeq(deseq, fitType = "local") diff_abund <- results(deseq) #filter for data with p-value below alpha (0.05) alpha <- 0.05 significant <- as.data.frame(diff_abund) significant <- filter(significant, padj < alpha) #merge tables with significant results with table of taxonomic information genera_df <- as.data.frame(tax_table(abundant_genera)) significant <- merge(significant, genera_df, by = "row.names") significant <- arrange(significant, log2FoldChange) dim(significant) significant #create differential abundance genera plot significant <- filter(significant, Genus != "g__") significant <- mutate(significant, Genus = factor(Genus, levels = Genus)) ggplot(significant, aes(x = log2FoldChange, y = Genus)) + geom_bar(stat = "identity") + labs(title = "Differential abundant genera PD low Vitamin B6 vs PD high Vitamin B6", x = expression(log[2]~fold~change), y = "Genus") + theme_bw()