### QIIME2 Workflow # Team 4 - Seohee An, Priya Gill, Brandon Park, Chelsea Williams # Import data qiime tools import \ --type SampleData[SequencesWithQuality] \ --input-path /mnt/datasets/project_2/soil/soil_manifest.txt \ --output-path demux.qza \ --input-format SingleEndFastqManifestPhred33V2 # Demultiplex qiime demux summarize \ --i-data demux.qza \ --o-visualization demux.qzv # Denoise qiime dada2 denoise-single \ --i-demultiplexed-seqs demux.qza \ --p-trim-left 0 \ --p-trunc-len 150 \ --o-representative-sequences rep-seqs-dada2.qza \ --o-table table-dada2.qza \ --o-denoising-stats stats-dada2.qza qiime metadata tabulate \ --m-input-file stats-dada2.qza \ --o-visualization stats-dada2.qzv mv rep-seqs-dada2.qza rep-seqs.qza mv table-dada2.qza table.qza # Filter 'NA' samples from metadata variables qiime feature-table filter-samples \ --i-table table.qza \ --m-metadata-file /mnt/datasets/project_2/soil/soil_metadata.txt \ --p-where “[Moisture Content]!=’NA’ AND [Total Carbon]!=’NA’ AND [Total Nitrogen]!=’NA’ AND [pH]!=’NA’” --o-filtered-table table_no_NA.qza # Generate feature table qiime feature-table summarize \ --i-table table_no_NA.qza \ --o-visualization table_no_NA.qzv \ --m-sample-metadata-file /mnt/datasets/project_2/soil/soil_metadata.txt qiime feature-table tabulate-seqs \ --i-data rep-seqs.qza \ --o-visualization rep-seqs.qzv # Create phylogenetic tree qiime phylogeny align-to-tree-mafft-fasttree \ --i-sequences rep-seqs.qza \ --o-alignment aligned-rep-seqs.qza \ --o-masked-alignment masked-aligned-rep-seqs.qza \ --o-tree unrooted-tree.qza \ --o-rooted-tree rooted-tree.qza # Core metrics qiime diversity core-metrics-phylogenetic \ --i-phylogeny rooted-tree.qza \ --i-table table_no_NA.qza \ --p-sampling-depth 4800 \ --m-metadata-file /mnt/datasets/project_2/soil/soil_metadata.txt \ --output-dir core-metrics-results # Alpha diversity analysis using the Shannon diversity index qiime diversity alpha-group-significance \ --i-alpha-diversity core-metrics-results/shannon_vector.qza \ --m-metadata-file /mnt/datasets/project_2/soil/soil_metadata.txt \ --o-visualization core-metrics-results/shannon-group-significance.qzv # Alpha rarefaction qiime diversity alpha-rarefaction \ --i-table table_no_NA.qza \ --i-phylogeny rooted-tree.qza \ --p-max-depth 10400 \ --m-metadata-file /mnt/datasets/project_2/soil/soil_metadata.txt \ --o-visualization alpha-rarefaction.qzv # Download Greengenes 99% database wget "ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz" gunzip gg_13_8_otus.tar.gz tar -xvf gg_13_8_otus.tar qiime tools import \ --type 'FeatureData[Sequence]' \ --input-path gg_13_8_otus/rep_set/99_otus.fasta \ --output-path ref-otus.qza qiime tools import \ --type 'FeatureData[Taxonomy]' \ --input-format HeaderlessTSVTaxonomyFormat \ --input-path gg_13_8_otus/taxonomy/99_otu_taxonomy.txt \ --output-path ref-taxonomy.qza # Extract study-specific reference reads qiime feature-classifier extract-reads \ --i-sequences ref-otus.qza \ --p-f-primer AGAGTTTGATCMTGGCTCAG \ --p-r-primer GWATTACCGCGGCKGCTG \ --p-trunc-len 150 \ --o-reads ref-seqs.qza # Train classifier using reference reads qiime feature-classifier fit-classifier-naive-bayes \ --i-reference-reads ref-seqs.qza \ --i-reference-taxonomy ref-taxonomy.qza \ --o-classifier classifier.qza # Use trained classifier to the study sequences (taxonomy analysis) qiime feature-classifier classify-sklearn \ --i-classifier classifier.qza \ --i-reads rep-seqs.qza \ --o-classification taxonomy.qza qiime metadata tabulate \ --m-input-file taxonomy.qza \ --o-visualization taxonomy.qzv qiime taxa barplot \ --i-table table_no_NA.qza \ --i-taxonomy taxonomy.qza \ --m-metadata-file /mnt/datasets/project_2/soil/soil_metadata.txt \ --o-visualization taxa-bar-plots.qzv # ASV-based filtering (removing Archaea, chloroplasts, mitochondria, and ultra-rare ASVs), generating a new table qiime feature-table filter-features \ --i-table table_no_NA.qza \ --p-min-frequency 500 \ --o-filtered-table feature-frequency-filtered-table.qza qiime taxa filter-table \ --i-table table_no_NA.qza \ --i-taxonomy taxonomy.qza \ --p-exclude mitochondria \ --o-filtered-table table-no-mitochondria.qza # Export data from QIIME2 qiime tools export \ --input-path table_no_NA.qza \ --output-path exported qiime tools export \ --input-path taxonomy.qza \ --output-path exported qiime tools export \ --input-path rooted-tree.qza \ --output-path exported # Open file in a text editor on server nano exported/taxonomy.tsv # Edit the column names and change `Feature ID` to `#OTUID` `Taxon` to `taxonomy` `Confidence` to `confidence` biom add-metadata \ -i exported/feature-table.biom \ -o exported/table-with-taxonomy.biom \ --observation-metadata-fp exported/taxonomy.tsv \ --sc-separated taxonomy Correlation_Analysis_R ### Correlation Analysis # Team 4 - Seohee An, Priya Gill, Brandon Park, Chelsea Williams # Loading packages library(tidyverse) library(vegan) library(phyloseq) library(DESeq2) library(ggplot2) # importing all data files biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("soil_metadata.txt") tree <- read_tree_greengenes("tree-soil.nwk") ### Alpha diversity functions # Shannon's diversity shannons = function(x){ present = x[x>0] p = present/sum(present) -sum(p*log(p)) } # Pielou's evennenss function evenness = function(x){ present = x[x>0] p = present/sum(present) -sum(p*log(p))/log(sum(x>0)) } # Richness function richness = function(x){ return(sum(x>0)) } ### Load data biom = import_biom("table-with-taxonomy.biom") taxa_table = otu_table(biom_file) taxonomy = tax_table(biom_file) metadata = read.table("soil_metadata.txt",sep="\t",header=T,row.names = 1) # Select only samples with metadata microbial_samples = colnames(taxa_table) metadata_samples = rownames(metadata) which_metadata = c() for (i in 1:dim(taxa_table)[2]){ which_metadata = c(which_metadata,which(metadata_samples == microbial_samples[i])) } metadata = metadata[which_metadata,] ### Calculate alpha diversity metadata$richness = apply(taxa_table,2,richness) metadata$shannons = apply(taxa_table,2,shannons) metadata$evenness = apply(taxa_table,2,evenness) ### Correlation plots of Shannon's vs metadata variables # pH ggplot(metadata,aes(x=pH,y=shannons)) + geom_point() + labs(x = "pH", y = "Shannon's Diversity") + geom_smooth(method="lm",formula=y~x) + theme_bw(base_size = 15) # Correlation cor.test(metadata$pH,metadata$shannons) # Regression ph_shannons_lm = lm(shannons ~ pH, data = metadata) summary(ph_shannons_lm) # Total Nitrogen ggplot(metadata,aes(x=Total.Nitrogen,y=shannons)) + geom_point() + labs(x = "Total Nitrogen", y = "Shannon's Diversity") + geom_smooth(method="lm",formula=y~x) + theme_bw(base_size = 15) # Correlation cor.test(metadata$Total.Nitrogen,metadata$shannons) # Regression nitrogen_shannons_lm = lm(shannons ~ Total.Nitrogen, data = metadata) summary(nitrogen_shannons_lm) # Total Carbon ggplot(metadata,aes(x=Total.Carbon,y=shannons)) + geom_point() + labs(x = "Total Carbon", y = "Shannon's Diversity") + geom_smooth(method="lm",formula=y~x) + theme_bw(base_size = 15) # Correlation cor.test(metadata$Total.Carbon,metadata$shannons) # Regression carbon_shannons_lm = lm(shannons ~ Total.Carbon, data = metadata) summary(carbon_shannons_lm) # Moisture Content ggplot(metadata,aes(x=Moisture.Content,y=shannons)) + geom_point() + labs(x = "Moisture Content", y = "Shannon's Diversity") + geom_smooth(method="lm",formula=y~x) + theme_bw(base_size = 15) # Correlation cor.test(metadata$Moisture.Content,metadata$shannons) # Regression moisture_shannons_lm = lm(shannons ~ Moisture.Content, data = metadata) summary(moisture_shannons_lm) ### Correlation plots of Shannon's Diversity vs metadata variables by LTSP # pH ggplot(metadata,aes(x=pH,y=shannons)) + facet_grid(~LTSP.Treatment) + geom_point() + labs(x = "pH", y = "Shannon's Diversity") + geom_smooth(method="lm",formula=y~x) + theme_bw(base_size = 15) ggplot(metadata,aes(x=pH,y=shannons,group=LTSP.Treatment,color=LTSP.Treatment)) + geom_point() + labs(x = "pH", y = "Shannon's Diversity") + geom_smooth(method="lm",formula=y~x) + theme_bw(base_size = 15) for (g in unique(metadata$LTSP.Treatment)){ print(paste("LTSP Treatment =",g)) temp_metadata = filter(metadata,LTSP.Treatment==g) # Correlation temp_cor = cor.test(temp_metadata$pH,temp_metadata$shannons) print(paste("Correlation =",temp_cor$estimate, "p value =",temp_cor$p.value)) # Regression om_ph_shannons_lm = lm(shannons ~ pH, data = temp_metadata) print(paste("Regression slope =",summary(om_ph_shannons_lm)$coefficients[2,1], "p value =",summary(om_ph_shannons_lm)$coefficients[2,4])) } # Total Nitrogen ggplot(metadata,aes(x=Total.Nitrogen,y=shannons)) + facet_grid(~LTSP.Treatment) + geom_point() + labs(x = "Total Nitrogen", y = "Shannon's Diversity") + geom_smooth(method="lm",formula=y~x) + theme_bw(base_size = 15) ggplot(metadata,aes(x=Total.Nitrogen,y=shannons,group=LTSP.Treatment,color=LTSP.Treatment)) + geom_point() + labs(x = "Total Nitrogen", y = "Shannon's Diversity") + geom_smooth(method="lm",formula=y~x) + theme_bw(base_size = 15) for (g in unique(metadata$LTSP.Treatment)){ print(paste("LTSP Treatment =",g)) temp_metadata = filter(metadata,LTSP.Treatment==g) # Correlation temp_cor = cor.test(temp_metadata$Total.Nitrogen,temp_metadata$shannons) print(paste("Correlation =",temp_cor$estimate, "p value =",temp_cor$p.value)) # Regression om_nitrogen_shannons_lm = lm(shannons ~ Total.Nitrogen, data = temp_metadata) print(paste("Regression slope =",summary(om_nitrogen_shannons_lm)$coefficients[2,1], "p value =",summary(om_nitrogen_shannons_lm)$coefficients[2,4])) } # Total Carbon ggplot(metadata,aes(x=Total.Carbon,y=shannons)) + facet_grid(~LTSP.Treatment) + geom_point() + labs(x = "Total Carbon", y = "Shannon's Diversity") + geom_smooth(method="lm",formula=y~x) + theme_bw(base_size = 15) ggplot(metadata,aes(x=Total.Carbon,y=shannons,group=LTSP.Treatment,color=LTSP.Treatment)) + geom_point() + labs(x = "Total Carbon", y = "Shannon's Diversity") + geom_smooth(method="lm",formula=y~x) + theme_bw(base_size = 15) for (g in unique(metadata$LTSP.Treatment)){ print(paste("LTSP Treatment =",g)) temp_metadata = filter(metadata,LTSP.Treatment==g) # Correlation temp_cor = cor.test(temp_metadata$Total.Carbon,temp_metadata$shannons) print(paste("Correlation =",temp_cor$estimate, "p value =",temp_cor$p.value)) # Regression om_carbon_shannons_lm = lm(shannons ~ Total.Carbon, data = temp_metadata) print(paste("Regression slope =",summary(om_carbon_shannons_lm)$coefficients[2,1], "p value =",summary(om_carbon_shannons_lm)$coefficients[2,4])) } # Moisture Content ggplot(metadata,aes(x=Moisture.Content,y=shannons)) + facet_grid(~LTSP.Treatment) + geom_point() + labs(x = "Moisture Content", y = "Shannon's Diversity") + geom_smooth(method="lm",formula=y~x) + theme_bw(base_size = 15) ggplot(metadata,aes(x=Moisture.Content,y=shannons,group=LTSP.Treatment,color=LTSP.Treatment)) + geom_point() + labs(x = "Moisture Content", y = "Shannon's Diversity") + geom_smooth(method="lm",formula=y~x) + theme_bw(base_size = 15) for (g in unique(metadata$LTSP.Treatment)){ print(paste("LTSP Treatment =",g)) temp_metadata = filter(metadata,LTSP.Treatment==g) # Correlation temp_cor = cor.test(temp_metadata$Moisture.Content,temp_metadata$shannons) print(paste("Correlation =",temp_cor$estimate, "p value =",temp_cor$p.value)) # Regression om_moisture_shannons_lm = lm(shannons ~ Moisture.Content, data = temp_metadata) print(paste("Regression slope =",summary(om_moisture_shannons_lm)$coefficients[2,1], "p value =",summary(om_moisture_shannons_lm)$coefficients[2,4])) } ### Differential Abundance Analysis # Team 4 - Seohee An, Priya Gill, Brandon Park, Chelsea Williams # Load CRAN packages library(tidyverse) library(vegan) # Load Bioconductor packages library(phyloseq) library(DESeq2) # Calculate relative abundance calculate_RA <- function(x) x/sum(x) # Helper functions gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x)) } # Export biom file and tree from QIIME2 and provide original metadata file biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("soil_metadata.txt") tree <- read_tree_greengenes("tree-soil.nwk") # Combine all information into a single phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) # Convert taxonomic rank from numbers to proper names colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species") # Keep only abundant ASVs # First determine counts of ASV across all samples total_counts <- taxa_sums(physeq) # Calculate relative abundance for each ASV relative_abundance <- calculate_RA(total_counts) # Determine which ASVs are more abundant than 0.1% abundant <- relative_abundance > 0.001 abundant_taxa <- prune_taxa(abundant, physeq) # Check the resulting new phyloseq object with much fewer taxa abundant_taxa # Phyloseq object is called "Class" class <- tax_glom(abundant_taxa, taxrank = "Class", NArm = FALSE) class # Only keep OM1 samples OM1class <- subset_samples(class, LTSP.Treatment == "OM1") OM1class # Limit to OM1 samples that were filtered deseq_feature <- phyloseq_to_deseq2(OM1class, ~ LTSP.Treatment) geo_means <- apply(counts(deseq_feature), 1, gm_mean) deseq_feature <- estimateSizeFactors(deseq_feature, geoMeans = geo_means) deseq_feature <- DESeq(deseq_feature, fitType="local") feature_diff_abund <- results(deseq_feature) # Define cutoff for the adjusted p-value alpha <- 0.05 # Reformat information as data frame including feature as variable significant_feature <- as_tibble(feature_diff_abund, rownames = "feature") # Keep only significant results and sort by adjusted p-value significant_feature <- filter(significant_feature, padj < alpha) significant_feature <- arrange(significant_feature, padj) # Get the taxonomic information as a data frame taxa_df <- as_tibble(as.data.frame(tax_table(physeq)), rownames = "feature") # Combine the significant features with taxonomic classification significant_feature <- inner_join(significant_feature, taxa_df) # There 14 features that are different at FDR-corrected p < 0.05! dim(significant_feature) # Plot differential abundance significant_feature %>% ggplot(aes(x = log2FoldChange, y = Class)) + geom_col() + xlab("Log2 Fold Change") + # Label for y-axis ylab("Class") + theme_bw(base_size = 16) # Convert to relative abundance and define taxrank as Class physeq_RA <- transform_sample_counts(physeq, calculate_RA) species_RA <- tax_glom(physeq_RA, taxrank = "Class", NArm = FALSE) # Graph actinobacteria and filter for that class with subset_taxa actinobacteria <- subset_taxa(species_RA, Class == "c__Actinobacteria") # Plot data by changing the table from wide to long format actinobacteria_melt <- psmelt(actinobacteria) # Plot with ggplot function # Define what dataset to plot and what variable on x-axis and y-axis ggplot(actinobacteria_melt, aes(x = as.factor(LTSP.Treatment), y = Abundance)) + # Visualize data as Boxplot geom_boxplot(aes(fill = as.factor(LTSP.Treatment))) + # Label for x-axis xlab("LTSP Treatment") + # Label for y-axis ylab("Relative Abundance") + # Plot title ggtitle("The Relative Abundance of Actinobacteria") + # colors for the different groups scale_fill_manual(values=c("gray42", "gray42", "gray42", "gray42")) + guides(fill = FALSE) + # Set a standard theme for plot theme_bw(base_size = 16) ### Beta Diversity Analysis # Team 4 - Seohee An, Priya Gill, Brandon Park, Chelsea Williams # Loading packages library(tidyverse) library(vegan) library(phyloseq) library(DESeq2) # Helper functions gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x)) } # Importing all data files biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("soil_metadata.txt") tree <- read_tree_greengenes("tree-soil.nwk") # Combine all objects into phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) # Overview of phyloseq object physeq # Use sample_data() to look at metadata of phyloseq object # Look only at first 6 lines of table with head() head(sample_data(physeq)) # Set set of random numbers set.seed(600) # taxonomic rank names rank_names(physeq) # Rename column names for taxonomic ranks colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species") rank_names(physeq) # Filter data based on metadata category with == OM1 <- subset_samples(physeq, LTSP.Treatment == "OM1") OM1 # Filter out data based on metadata category with != no_OM1 <- subset_samples(physeq, LTSP.Treatment != "OM1") no_OM1 # format data as table with as.data.frame() as.data.frame(sample_names(OM1)) # Show read count for each sample using sample_sums() as.data.frame(sample_sums(OM1)) # Exclude samples with less than 7000 reads OM1_7000 <- prune_samples(sample_sums(OM1) >= 7000, OM1) OM1_7000 # Beta diversity PCoA plot # Diversity requires rarefied taxa tables physeq_rar <- rarefy_even_depth(physeq, sample.size = 4800) # Convert to RA (relative abundance) physeq_rar_RA <- transform_sample_counts(physeq_rar, function(x) x/sum(x)) # Define ordinate() and set method = PCoA and use weighted unifrac metric ord <- ordinate(physeq_rar_RA, method = "PCoA", distance = "wunifrac") # Plot data plot_ordination(physeq_rar_RA, ord, type = "sample", color = "LTSP.Treatment", title = "PCoA (Weighted Unifrac)") + # Adding text to plot annotate(geom = "text", label = ".", x = - 0.025, y = 0.025, size = 4) + # Manually adjust colours for points scale_colour_manual(values = c("red", "orange", "forestgreen", "blue"), labels = c("OM1", "OM2", "OM3", "REF")) + stat_ellipse(type = "norm", size = 1) + guides(colour = guide_legend("LTSP Treatment")) + theme_bw(base_size = 14)