# Haq et al R Script #Loading all necessary packages library(tidyverse) library(vegan) library(ape) library(phyloseq) library(DESeq2) #Differential Abundance for Infant Use of Probiotics #Provide custom functions calculate_relative_abundance <- function(x) x / sum(x) calculate_gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x [x > 0]), na.mr = na.rm) / length (x)) } set.seed(711) biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("infant_metadata.tsv") tree <- read_tree_greengenes("tree.nwk") # Convert from multichotomous to dichotmous tree tree <- multi2di(tree) # Combine all information into a single phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) #Look at first 6 rows of metadata head(sample_data(physeq)) #Assign new taxonomic rank column names colnames(tax_table(physeq)) <- c("Kingdom","Phylum", "Class", "Order", "Family", "Genus", "Species") head(tax_table(physeq)) #Filtering by depth sample_sums(physeq) sample_sums(physeq) >= 1000 at_least_1000 <- prune_samples(sample_sums(physeq) >= 1000, physeq) sample_sums(at_least_1000) #Differential abundance analysis #Filtering based on metadata infant <- subset_samples(at_least_1000, life_stage == "Infant") infant <- subset_samples(infant, probiotic_inf != "not collected") #Removing low-abundant features (step 5) infant_counts <- taxa_sums(infant) relative_abundance_infant <- calculate_relative_abundance(infant_counts) abundant_infant <- relative_abundance_infant > 0.0005 abundant_infant_taxa <- prune_taxa(abundant_infant, infant) #Set taxanomic level (step 6) abundant_infant_genera <- tax_glom(abundant_infant_taxa, taxrank = "Genus") sample_data(abundant_infant_genera)$probiotic_inf <- factor (sample_data(abundant_infant_genera)$probiotic_inf, levels = c("yes", "no")) #Discussion help infant <- prune_samples(sample_sums(infant) > 0, infant) #DESeq2 analysis deseq_infant <- phyloseq_to_deseq2(abundant_infant_genera, ~ probiotic_inf) geo_means <- apply(counts(deseq_infant), 1, calculate_gm_mean) #Make sure to filter out low quality deseq_infant <- estimateSizeFactors(deseq_infant, geoMeans = geo_means) deseq_infant <- DESeq(deseq_infant, fitType = "local") infant_diff_abund <- results(deseq_infant) #Determine significant results alpha <- 0.05 significant_infant <- as.data.frame(infant_diff_abund) significant_infant <- filter(significant_infant, padj < alpha) #Add taxonomic information genera_df <- as.data.frame(tax_table(abundant_infant_genera)) significant_infant <- merge(significant_infant, genera_df, by = "row.names") significant_infant <- arrange(significant_infant, log2FoldChange) significant_infant <- mutate(significant_infant, Genus = factor(Genus, levels = Genus)) #Plot ggplot(significant_infant, aes(x = log2FoldChange, y = Genus)) + geom_bar(stat = "identity") + labs(title = "Differential abundant genera in infant probiotic use", x = "log2 fold-change", y = "Genus") + theme_bw()+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ geom_bar(stat="identity", fill="#FF6565", colour="black") #Differential Abundance for Mother Use of Probiotics #Provide custom functions calculate_relative_abundance <- function(x) x / sum(x) calculate_gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x [x > 0]), na.mr = na.rm) / length (x)) } set.seed(711) biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("infant_metadata.tsv") tree <- read_tree_greengenes("tree.nwk") # Convert from multichotomous to dichotmous tree tree <- multi2di(tree) # Combine all information into a single phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) #Look at first 6 rows of metadata head(sample_data(physeq)) #Assign new taxonomic rank column names colnames(tax_table(physeq)) <- c("Kingdom","Phylum", "Class", "Order", "Family", "Genus", "Species") head(tax_table(physeq)) #Filtering by depth sample_sums(physeq) sample_sums(physeq) >= 1000 at_least_1000 <- prune_samples(sample_sums(physeq) >= 1000, physeq) sample_sums(at_least_1000) #Differential abundance analysis #Filtering based on metadata infant <- subset_samples(at_least_1000, life_stage == "Infant") infant <- subset_samples(infant, probiotic_mom != "not collected") #Removing low-abundant features (step 5) infant_counts <- taxa_sums(infant) relative_abundance_infant <- calculate_relative_abundance(infant_counts) abundant_infant <- relative_abundance_infant > 0.0005 abundant_infant_taxa <- prune_taxa(abundant_infant, infant) #Set taxanomic level (step 6) abundant_infant_genera <- tax_glom(abundant_infant_taxa, taxrank = "Genus") sample_data(abundant_infant_genera)$probiotic_mom <- factor (sample_data(abundant_infant_genera)$probiotic_mom, levels = c("yes", "no")) #Discussion help infant <- prune_samples(sample_sums(infant) > 0, infant) #DESeq2 analysis deseq_infant <- phyloseq_to_deseq2(abundant_infant_genera, ~ probiotic_mom) geo_means <- apply(counts(deseq_infant), 1, calculate_gm_mean) #Make sure to filter out low quality deseq_infant <- estimateSizeFactors(deseq_infant, geoMeans = geo_means) deseq_infant <- DESeq(deseq_infant, fitType = "local") infant_diff_abund <- results(deseq_infant) #Determine significant results alpha <- 0.05 significant_infant <- as.data.frame(infant_diff_abund) significant_infant <- filter(significant_infant, padj < alpha) #Add taxonomic information genera_df <- as.data.frame(tax_table(abundant_infant_genera)) significant_infant <- merge(significant_infant, genera_df, by = "row.names") significant_infant <- arrange(significant_infant, log2FoldChange) significant_infant <- mutate(significant_infant, Genus = factor(Genus, levels = Genus)) #Plot ggplot(significant_infant, aes(x = log2FoldChange, y = Genus)) + geom_bar(stat = "identity") + labs(title = "Differential abundant genera in mother probiotic use", x = "log2 fold-change", y = "Genus") + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ geom_bar(stat="identity", fill="#FF6565", colour="black") #Differential Abundance for the 1-14-27 cohort #Provide custom functions calculate_relative_abundance <- function(x) x / sum(x) calculate_gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x [x > 0]), na.mr = na.rm) / length (x)) } set.seed(711) biom_file <- import_biom("table-with-taxonomy3.biom") metadata <- import_qiime_sample_data("infant_metadata.tsv") tree <- read_tree_greengenes("tree.nwk") # Convert from multichotomous to dichotmous tree tree <- multi2di(tree) # Combine all information into a single phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) #Look at first 6 rows of metadata head(sample_data(physeq)) #Assign new taxonomic rank column names colnames(tax_table(physeq)) <- c("Kingdom","Phylum", "Class", "Order", "Family", "Genus", "Species") head(tax_table(physeq)) #Filtering by depth sample_sums(physeq) sample_sums(physeq) >= 1000 at_least_1000 <- prune_samples(sample_sums(physeq) >= 1000, physeq) #Differential abundance analysis #Filtering based on metadata infant <- subset_samples(physeq, life_stage == "Infant") infant <- subset_samples(infant, anonymized_name == "70001" || "70014" || "70027") infant <- subset_samples(infant, age_category == "0.5 months" || "6 months") infant <- subset_samples(infant, age_category != "4 months") infant <- subset_samples(infant, age_category != "9 months") infant <- subset_samples(infant, age_category != "2 months") #Removing low-abundant features (step 5) infant_counts <- taxa_sums(infant) relative_abundance_infant <- calculate_relative_abundance(infant_counts) abundant_infant <- relative_abundance_infant > 0.0005 abundant_infant_taxa <- prune_taxa(abundant_infant, infant) #Set taxanomic level (step 6) abundant_infant_genera <- tax_glom(abundant_infant_taxa, taxrank = "Genus") sample_data(abundant_infant_genera)$age_category <- factor (sample_data(abundant_infant_genera)$age_category, levels = c("0.5 months", "6 months")) #DESeq2 analysis NOT WORKING deseq_infant <- phyloseq_to_deseq2(abundant_infant_genera, ~ age_category) geo_means <- apply(counts(deseq_infant), 1, calculate_gm_mean) deseq_infant <- estimateSizeFactors(deseq_infant, geoMeans = geo_means) deseq_infant <- DESeq(deseq_infant, fitType = "local") infant_diff_abund <- results(deseq_infant) #Determine significant results alpha <- 0.05 significant_infant <- as.data.frame(infant_diff_abund) significant_infant <- filter(significant_infant, padj < alpha) #Add taxonomic information genera_df <- as.data.frame(tax_table(abundant_infant_genera)) significant_infant <- merge(significant_infant, genera_df, by = "row.names") significant_infant <- arrange(significant_infant, log2FoldChange) significant_infant <- mutate(significant_infant, Genus = factor(Genus, levels = Genus)) ggplot(significant_infant, aes(x = log2FoldChange, y = Genus)) + geom_bar(stat = "identity") + labs(title = "differential abundant genera - 0.5 vs 6 month", x = "log2 fold-change", y = "Genus") + theme_bw()+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ geom_bar(stat="identity", fill="#F4D7EE", colour="black") #Relative Abundance for Infant Use of Probiotics #Provide custom functions calculate_relative_abundance <- function(x) x / sum(x) calculate_gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x [x > 0]), na.mr = na.rm) / length (x)) } set.seed(711) biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("infant_metadata.tsv") tree <- read_tree_greengenes("tree.nwk") # Convert from multichotomous to dichotmous tree tree <- multi2di(tree) # Combine all information into a single phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) #Assign new taxonomic rank column names colnames(tax_table(physeq)) <- c("Kingdom","Phylum", "Class", "Order", "Family", "Genus", "Species") head(tax_table(physeq)) #Filtering by depth sample_sums(physeq) sample_sums(physeq) >= 1000 at_least_1000 <- prune_samples(sample_sums(physeq) >= 1000, physeq) sample_sums(at_least_1000) #Step 2 filtering by metadata infant <- subset_samples(physeq, life_stage == "Infant") infant <- subset_samples(infant, probiotic_inf != "not collected") #Step 4 Calculate relative abundance infant_RA <- transform_sample_counts(infant, calculate_relative_abundance) #Step 5 remove low abundant taxa infant_counts <- taxa_sums(infant) relative_abundance_infant <- calculate_relative_abundance((infant_counts)) abundant_infant <- relative_abundance_infant > 0.0005 abundant_infant_RA_taxa <- prune_taxa(abundant_infant, infant_RA) #Step 6 Tax glom abundant_infant_RA_genera <- tax_glom(abundant_infant_RA_taxa, taxrank = "Genus") #Choose different genera across code to see appropriate results stenotrophomonas <- subset_taxa(abundant_infant_RA_genera, Genus == "g__Stenotrophomonas") clostridioides <- subset_taxa(abundant_infant_RA_genera, Genus == "g__Clostridioides") akkermansia <- subset_taxa(abundant_infant_RA_genera, Genus == "g__Akkermansia") erysipelatoclostridium <- subset_taxa(abundant_infant_RA_genera, Genus == "g__Erysipelatoclostridium") haemophilus <- subset_taxa(abundant_infant_RA_genera, Genus == "g__Haemophilus") escherichia_shigella <- subset_taxa(abundant_infant_RA_genera, Genus == "g__Escherichia-Shigella") otu_table(haemophilus) #Transfrom data into data frame as.data.frame(otu_table(haemophilus)) #Rearrange data from columns along rows haemophilus_long <- psmelt(haemophilus) #Plot data with ggplot, when bracketed with parenthesis #gets plotted in viewer and saved to variable #Aesthetics define how our data are represented p <- ggplot(haemophilus_long, aes(x = probiotic_inf, y = Abundance, fill = probiotic_inf)) + #Geom define how our data is represented as a geometric shape geom_boxplot() + labs(title = "Relative abundance of Haemophilus", x = "Infant probiotic use", y = "Relative abundance")+ theme_bw()+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_blank(), axis.line = element_line(colour = "black")) p #checking with smaller scales p + lims(y = c(0, 0.0000001)) #Relative Abundance for Mother Use of Probiotics #Provide custom functions calculate_relative_abundance <- function(x) x / sum(x) calculate_gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x [x > 0]), na.mr = na.rm) / length (x)) } set.seed(711) biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("infant_metadata.tsv") tree <- read_tree_greengenes("tree.nwk") # Convert from multichotomous to dichotmous tree tree <- multi2di(tree) # Combine all information into a single phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) #Assign new taxonomic rank column names colnames(tax_table(physeq)) <- c("Kingdom","Phylum", "Class", "Order", "Family", "Genus", "Species") head(tax_table(physeq)) #Filtering by depth sample_sums(physeq) sample_sums(physeq) >= 1000 at_least_1000 <- prune_samples(sample_sums(physeq) >= 1000, physeq) sample_sums(at_least_1000) #Step 2 filtering by metadata infant <- subset_samples(physeq, life_stage == "Infant") infant <- subset_samples(infant, probiotic_mom != "not collected") #Step 4 Calculate relative abundance infant_RA <- transform_sample_counts(infant, calculate_relative_abundance) #Step 5 remove low abundant taxa infant_counts <- taxa_sums(infant) relative_abundance_infant <- calculate_relative_abundance((infant_counts)) abundant_infant <- relative_abundance_infant > 0.0005 abundant_infant_RA_taxa <- prune_taxa(abundant_infant, infant_RA) #Step 6 Tax glom abundant_infant_RA_genera <- tax_glom(abundant_infant_RA_taxa, taxrank = "Genus") #Choose different genera across code to see appropriate results acinetobacter <- subset_taxa(abundant_infant_RA_genera, Genus == "g__Acinetobacter") collinsella <- subset_taxa(abundant_infant_RA_genera, Genus == "g__Collinsella") akkermansia <- subset_taxa(abundant_infant_RA_genera, Genus == "g__Akkermansia") otu_table(akkermansia) #Transfrom data into data frame as.data.frame(otu_table(akkermansia)) #Rearrange data from columns along rows akkermansia_long <- psmelt(akkermansia) #Plot data with ggplot, when bracketed with parenthesis #gets plotted in viewer and saved to variable #Aesthetics define how our data are represented (p <- ggplot(akkermansia_long, aes(x = probiotic_mom, y = Abundance, fill = probiotic_mom)) + #Geom define how our data is represented as a geometric shape geom_boxplot() + labs(title = "Relative abundance of Akkermansia", x = "Mother probiotic use", y = "Relative abundance"))+ theme_bw()+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_blank(), axis.line = element_line(colour = "black")) #Relative Abundance for 1-14-27 cohort of similar probiotic use calculate_gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x [x > 0]), na.mr = na.rm) / length (x)) } set.seed(711) biom_file <- import_biom("table-with-taxonomy3.biom") metadata <- import_qiime_sample_data("infant_metadata.tsv") tree <- read_tree_greengenes("tree.nwk") # Convert from multichotomous to dichotmous tree tree <- multi2di(tree) # Combine all information into a single phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) #Assign new taxonomic rank column names colnames(tax_table(physeq)) <- c("Kingdom","Phylum", "Class", "Order", "Family", "Genus", "Species") head(tax_table(physeq)) #Filtering by depth sample_sums(physeq) sample_sums(physeq) >= 1000 at_least_1000 <- prune_samples(sample_sums(physeq) >= 1000, physeq) sample_sums(at_least_1000) #Step 2 filtering by metadata infant <- subset_samples(physeq, life_stage == "Infant") infant <- subset_samples(infant, probiotic_inf != "not collected") infant <- subset_samples(infant, anonymized_name == "70001" || "70014" || "70027") infant <- subset_samples(infant, age_category == "0.5 months" || "6 months") infant <- subset_samples(infant, age_category != "4 months") infant <- subset_samples(infant, age_category != "9 months") infant <- subset_samples(infant, age_category != "2 months") #Step 4 Calculate relative abundance infant_RA <- transform_sample_counts(infant, calculate_relative_abundance) #Step 5 remove low abundant taxa infant_counts <- taxa_sums(infant) relative_abundance_infant <- calculate_relative_abundance((infant_counts)) abundant_infant <- relative_abundance_infant > 0.0005 abundant_infant_RA_taxa <- prune_taxa(abundant_infant, infant_RA) #Step 6 Tax glom abundant_infant_RA_genera <- tax_glom(abundant_infant_RA_taxa, taxrank = "Genus") #Choose different genera across code to see appropriate results clostridioides <- subset_taxa(abundant_infant_RA_genera, Genus == "g__Clostridioides") akkermansia <- subset_taxa(abundant_infant_RA_genera, Genus == "g__Akkermansia") bacteroides <- subset_taxa(abundant_infant_RA_genera, Genus == "g__Bacteroides") bifidobacterium <- subset_taxa(abundant_infant_RA_genera, Genus == "g__Bifidobacterium") clostridium_sensu_stricto_1 <- subset_taxa(abundant_infant_RA_genera, Genus == "g__Clostridium_sensu_stricto_1") otu_table(bacteroides) #Transfrom data into data frame as.data.frame(otu_table(bacteroides)) #Rearrange data from columns along rows bacteroides_long <- psmelt(bacteroides) #Plot data with ggplot, when bracketed with parenthesis #gets plotted in viewer and saved to variable #Aesthetics define how our data are represented p <- ggplot(bacteroides_long, aes(x = age_category, y = Abundance, fill = age_category)) + #Geom define how our data is represented as a geometric shape geom_boxplot() + labs(title = "Relative abundance of Bacteroides", x = "Time point", y = "Relative abundance")+ theme_bw()+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_blank(), axis.line = element_line(colour = "black")) p #checking with smaller scales p + lims(y = c(0, 0.001))