#!/bin/bash ############################## # user credentials for the server: # User: root # IP: 10.19.139.138 # Password: Biome625 ############################## Create working directory in server and import data ############################## # move into the data folder of the server cd data # create directory for project and move into it mkdir soil_project cd soil_project # import data into working directory (Make sure you are in the soil_project directory) # this commands generates the demux.qza file qiime tools import \ --type 'SampleData[SequencesWithQuality]' \ --input-path /mnt/datasets/project_2/soil/soil_manifest.txt \ --output-path demux.qza \ --input-format SingleEndFastqManifestPhred33V2 # create visualization for demux.qza file and visualize it qiime demux summarize \ --i-data demux.qza \ --o-visualization demux.qzv #copy metadata file from mnt folder to working directory cp /mnt/datasets/project_2/soil/soil_metadata.txt soil_metadata.txt # determine ASVs with DADA2 qiime dada2 denoise-single \ --i-demultiplexed-seqs demux.qza \ --p-trim-left 0 \ --p-trunc-len 150 \ --o-representative-sequences rep-seqs.qza \ --o-table table.qza \ --o-denoising-stats stats.qza #visualize ASV statistics qiime feature-table summarize \ --i-table table.qza \ --o-visualization table.qzv \ --m-sample-metadata-file soil_metadata.txt ############################## Do metadata filtering using qiime filter-table, filter-features and filter-samples command and generate table.qzv visualization ############################## #Filter out low frequency reads qiime feature-table filter-features \ --i-table table.qza \ --p-min-frequency 10 \ --o-filtered-table frequency_filter_table.qza #Filter by metadata samples and select for samples only in BC qiime feature-table filter-samples \ --i-table frequency_filter_table.qza --m-metadata-file soil_metadata.txt \ --p-where "[Region] IN ('British Columbia')" \ --o-filtered-table freqeuncy_filter_BC_table.qza # generate the visualization for the table.qza qiime feature-table summarize \ --i-table frequency_filter_BC_table.qza \ --o-visualization filtered_table.qzv \ --m-sample-metadata-file soil_metadata.txt ############################## Generate tree for phylogenetic analyses using the qiime-phylogeny command (rooted-tree.qza and unrooted-tree.qza) ############################## qiime phylogeny align-to-tree-mafft-fasttree \ --i-sequences rep-seqs.qza \ --o-alignment aligned-rep-seqs.qza \ --o-masked-alignment masked-aligned-rep-seqs.qza \ --o-tree unrooted-tree.qza \ --o-rooted-tree rooted-tree.qza ############################## Generate alpha rarefaction visualization using the qiime diversity command. ############################## #Using the visualization, determine if the diversity for OM treatments is able to plateau and at which sequencing #depth. Select an adequate sequencing depth to be used in downstream alpha and beta diversity plots. # alpha rarefaction only including BC qiime diversity alpha-rarefaction \ --i-table frequency_filter_BC_table.qza \ --i-phylogeny rooted-tree.qza \ --p-max-depth 9276 \ --m-metadata-file soil_metadata.txt \ --o-visualization alpha-rarefaction.qzv # alpha-rarefaction including all regions qiime diversity alpha-rarefaction \ --i-table frequency_filter_table.qza \ --i-phylogeny rooted-tree.qza \ --p-max-depth 9276 \ --m-metadata-file soil_metadata.txt \ --o-visualization alpha-rarefaction_all_regions.qzv #Ran the rarefaction again but with a different max-depth qiime diversity alpha-rarefaction \ --i-table frequency_filter_BC_table.qza \ --i-phylogeny rooted-tree.qza \ --p-max-depth 6000 \ --m-metadata-file soil_metadata.txt \ --o-visualization alpha-rarefaction.qzv ############################## Training a taxonomic classifier and classifying ASVs ############################## # dowload the Greenges 97% version Database wget "ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz" #Unzip the database gunzip gg_13_8_otus.tar.gz tar -xvf gg_13_8_otus.tar #obtain 16S sequences qiime tools import \ --type 'FeatureData[Sequence]' \ --input-path gg_13_8_otus/rep_set/97_otus.fasta \ --output-path ref-otus.qza #obtain the associated taxonomic information qiime tools import \ --type 'FeatureData[Taxonomy]' \ --input-format HeaderlessTSVTaxonomyFormat \ --input-path gg_13_8_otus/taxonomy/97_otu_taxonomy.txt \ --output-path ref-taxonomy.qza # extract the sequences between the primers from the 16S sequences in the database qiime feature-classifier extract-reads \ --i-sequences ref-otus.qza \ --p-f-primer AGAGTTTGATCMTGGCTCAG \ --p-r-primer GWATTACCGCGGCKGCTG \ --p-trunc-len 150 \ --o-reads ref-seqs.qza # Train classifier based on the 16S reads qiime feature-classifier fit-classifier-naive-bayes \ --i-reference-reads ref-seqs.qza \ --i-reference-taxonomy ref-taxonomy.qza \ --o-classifier classifier.qza # classify your ASVs qiime feature-classifier classify-sklearn \ --i-classifier classifier.qza \ --i-reads rep-seqs.qza \ --o-classification taxonomy.qza # generate visualization of the taxonomy file qiime metadata tabulate \ --m-input-file taxonomy.qza \ --o-visualization taxonomy.qzv # generate taxonomy barplot qiime taxa barplot \ --i-table table.qza \ --i-taxonomy taxonomy.qza \ --m-metadata-file soil_metadata.txt \ --o-visualization taxa-bar-plots.qzv ############################## Filter mitochondrial and chloroplast sequences using defined taxonomy classification ############################## # filter mitochondrial and chloroplast sequences qiime taxa filter-table \ --i-table frequency_filter_BC_table.qza \ --i-taxonomy taxonomy.qza \ --p-exclude mitochondria,chloroplast \ --o-filtered-table frequency_filter_BC_no_mitoc_chloroplast_table.qza ############################## Generate the core metric directory (calculate alpha- and beta-diversity metrics) with filtered table and selected sequencing depth ############################## # command for core-metrics-results directory qiime diversity core-metrics-phylogenetic \ --i-phylogeny rooted-tree.qza \ --i-table frequency_filter_BC_no_mitoc_chloroplast_table.qza \ --p-sampling-depth 6041 \ --m-metadata-file soil_metadata.txt \ --output-dir core-metrics-results # calculate beta-diversity significance for unweighted Unifrac based on LTSP treatment qiime diversity beta-group-significance \ --i-distance-matrix core-metrics-results/unweighted_unifrac_distance_matrix.qza \ --m-metadata-file soil_metadata.txt \ --m-metadata-column "LTSP Treatment" \ --o-visualization core-metrics-results/unweighted-unifrac-LTSP-group-significance.qzv \ --p-pairwise # calculate beta-diversity significance for weighted Unifrac based on LTSP treatment qiime diversity beta-group-significance \ --i-distance-matrix core-metrics-results/weighted_unifrac_distance_matrix.qza \ --m-metadata-file soil_metadata.txt \ --m-metadata-column "LTSP Treatment" \ --o-visualization core-metrics-results/weighted-unifrac-LTSP-group-significance.qzv \ --p-pairwise # calculate alpha diversity metrics for the data # calculate Faiths alpha diversity metric qiime diversity alpha-group-significance \ --i-alpha-diversity core-metrics-results/faith_pd_vector.qza \ --m-metadata-file soil_metadata.txt \ --o-visualization core-metrics-results/faith-pd-group-significance.qzv # calculate Pielou's eveness metric qiime diversity alpha-group-significance \ --i-alpha-diversity core-metrics-results/evenness_vector.qza \ --m-metadata-file soil_metadata.txt \ --o-visualization core-metrics-results/evenness-group-significance.qzv # calculate total observed features (ASVs) metric qiime diversity alpha-group-significance \ --i-alpha-diversity core-metrics-results/observed_features_vector.qza \ --m-metadata-file soil_metadata.txt \ --o-visualization core-metrics-results/observed_features_significance.qzv # calculate shannon diversity metric qiime diversity alpha-group-significance \ --i-alpha-diversity core-metrics-results/shannon_vector.qza \ --m-metadata-file soil_metadata.txt \ --o-visualization core-metrics-results/shannon_significance.qzv ############################## Export QIIME2 files and combine files into a .biom file ############################## #make new directory for exported BIOM files mkdir exported #export BIOM files qiime tools export \ --input-path table.qza \ --output-path exported qiime tools export \ --input-path taxonomy.qza \ --output-path exported qiime tools export --input-path rooted-tree.qza \ --output-path exported #open file in text editor on server nano exported/taxonomy.tsv #edit the column names and change #'Feature ID' to '#OTUID' #'Taxon' to 'taxonomy' #'Confidence' to 'confidence' #combine files into .biom file biom add-metadata \ -i exported/feature-table.biom \ -o exported/table-with-taxonomy.biom \ --observation-metadata-fp exported/taxonomy.tsv \ --sc-separated taxonomy --- output: pdf_document: default html_document: default --- # Load all the packages needed: ```{r} library(readr) library(dplyr) library(tidyverse) library(readxl) library(lubridate) library(chron) library(ggsci) library(grid) library(gridExtra) ``` # 1. Load the metadata from soil ```{r} soil_metadata <- read_excel("soil_metadata_reformatted.xlsx", na = "NA") %>% mutate(LTSP = as.factor(LTSP)) %>% mutate(Elevation = as.factor(Elevation)) soil_metadata ``` # 2. look at the different regions (were samples were taken from) in relationship to Carbon and Nitrogen ```{r} soil_metadata %>% ggplot(aes(x = Total_Carbon, y = Total_Nitrogen, color = Region)) + geom_jitter() + labs(x ="Total Carbon", y = "Total Nitrogen") ``` # filter to only include samples in British Columbia and looks at Total Carbon and Total nitrogen in relationship to LTSP treatment (REF, OM1, OM2, OM3) ```{r} soil_metadata %>% filter(Region == "British Columbia") %>% ggplot(aes(x = Total_Carbon, y = Total_Nitrogen, color = LTSP)) + geom_jitter(width = 0.5, height = 0.1) + labs(x ="Total Carbon", y = "Total Nitrogen") ``` # filter data to include only samples from one OM1,2 or Ref and compare it to OM3 (this is so that we can easily visualize if there are any visible distinctions) Comparison between Ref and OM3 ```{r} soil_metadata %>% filter(Region == "British Columbia") %>% filter(LTSP %in% c("REF", "OM3")) %>% ggplot(aes(x = Total_Carbon, y = Total_Nitrogen, color = LTSP)) + geom_jitter(width = 0.5, height = 0.1) + labs(x ="Total Carbon", y = "Total Nitrogen") ``` Comparison between OM1 and OM3 ```{r} soil_metadata %>% filter(Region == "British Columbia") %>% filter(LTSP %in% c("OM1", "OM3")) %>% ggplot(aes(x = Total_Carbon, y = Total_Nitrogen, color = LTSP)) + geom_jitter(width = 0.5, height = 0.1) + labs(x ="Total Carbon", y = "Total Nitrogen") ``` Comparison between OM1 and OM3 ```{r} soil_metadata %>% filter(Region == "British Columbia") %>% filter(LTSP %in% c("OM2", "OM3")) %>% ggplot(aes(x = Total_Carbon, y = Total_Nitrogen, color = LTSP)) + geom_jitter(width = 0.5, height = 0.1) + labs(x ="Total Carbon", y = "Total Nitrogen") ``` ```{r} soil_metadata %>% filter(Region == "British Columbia") %>% ggplot(aes(x = Moisture_content, y = Soil_Bulk_Density, color = LTSP)) + geom_jitter() + labs(x ="moisture content", y = "Soil bulk density") ``` # look at total carbon and total nitrogen in relationship to soil horizon Comparison between OM1 and OM3 ```{r} soil_metadata %>% filter(Region == "British Columbia") %>% ggplot(aes(x = Total_Carbon, y = Total_Nitrogen, color = Horizon)) + geom_jitter(width = 0.5, height = 0.1) + facet_wrap(~ LTSP) + labs(x ="Total Carbon", y = "Total Nitrogen") ``` # Load CRAN packages ```{r} library(tidyverse) library(vegan) ``` # Load Bioconductor packages ```{r} library(phyloseq) library(DESeq2) library(tidyr) library(ape) ``` **Note: Before doing the following analyses, make sure that you have exported the table.qza, taxonomy.qza, metadata and rooted-tree files into you working directory** - for instructions on how to do this, refer to the QIIME2 working script # Export biom file and tree from QIIME2 and provide original metadata file ```{r} biom_file <- import_biom("exported/table-with-taxonomy.biom") metadata <- import_qiime_sample_data("soil_metadata.txt") tree <- read_tree_greengenes("exported/tree.nwk") tree <- multi2di(tree) # assuming you read in your tree.nwk file as the variable tree ``` # Combine all information into a single phyloseq object ```{r} physeq <- merge_phyloseq(biom_file, metadata, tree) ``` ```{r} # look at sample metadata head(sample_data(physeq)) # downstream steps may use randomizers so we set.seet for reproducibility set.seed(711) # taxonomic rank names rank_names(physeq) #kingdom class and all of that, however you can change the name # rename the ranks colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species") head(tax_table(physeq)) ``` # filter data based on metadata category (British Columbia) ```{r} # filter data on metadata filtered_phyloseq <- subset_samples(physeq, Region == "British Columbia") filtered_phyloseq #format data as table with as.data.frame() as.data.frame(sample_names(filtered_phyloseq)) #show read count for each sampl ``` # beta diversity plots ```{r} # diversity analysis requires rarefied taxa tables # we know from Qiime that we should rarefy to 6041, so right palm samples level out physeq_rar <- rarefy_even_depth(filtered_phyloseq, sample.size = 6041) # convert to RA (relative abundance) physeq_rar_RA <- transform_sample_counts(physeq_rar, function(x) x/sum(x)) # use our data to do beta-diversity analysis # we define the type of analysis using the ordinate() function and setting the method = PCoA, setting distance to type # of beta diversity analysis # you can change analysis to jaccard, bray-curtis and so on ord <- ordinate(physeq_rar_RA, method = "PCoA", distance = "wunifrac") # plot the data based on OM removal treatments and soil horizon plot_ordination(physeq_rar_RA, ord, type = "sample", color = "LTSP.Treatment", title = "PCoA (weighted UniFrac)", shape = "Horizon") + # manually adjust colors for points scale_colour_manual(values = c("blue", "red", "black", "orange")) + #labels = c("REF", "Left palm", "Right palm", "Tongue")) + # change the labels # add elipse automatically stat_ellipse(type = "norm", size = 1) + # how to change the legend title guides(color = guide_legend("LTSP Treatment")) + # add theme theme_bw(base_size = 14) ``` # plot the data based on OM treatment == "REF" ```{r} # plot the data based on OM treatment == "REF" plot_ordination(subset_samples(physeq_rar_RA, LTSP.Treatment == "REF"), ord, type = "sample", color = "LTSP.Treatment", title = "PCoA (weighted UniFrac)", shape = "Horizon") + # manually adjust colors for points scale_colour_manual(values = c("orange")) + # add elipse automatically stat_ellipse(type = "norm", size = 1) + # how to change the legend title guides(color = guide_legend("LTSP Treatment")) + # add theme theme_bw(base_size = 14) ``` # plot the data based on OM treatment == "OM1" ```{r} # plot the data based on OM treatment == "REF" plot_ordination(subset_samples(physeq_rar_RA, LTSP.Treatment == "OM1"), ord, type = "sample", color = "LTSP.Treatment", title = "PCoA (weighted UniFrac)", shape = "Horizon") + # manually adjust colors for points scale_colour_manual(values = c("blue")) + # add elipse automatically stat_ellipse(type = "norm", size = 1) + # how to change the legend title guides(color = guide_legend("LTSP Treatment")) + # add theme theme_bw(base_size = 14) ``` # plot the data based on OM treatment == "OM2" ```{r} # plot the data based on OM treatment == "REF" plot_ordination(subset_samples(physeq_rar_RA, LTSP.Treatment == "OM2"), ord, type = "sample", color = "LTSP.Treatment", title = "PCoA (weighted UniFrac)", shape = "Horizon") + # manually adjust colors for points scale_colour_manual(values = c("red")) + # add elipse automatically stat_ellipse(type = "norm", size = 1) + # how to change the legend title guides(color = guide_legend("LTSP Treatment")) + # add theme theme_bw(base_size = 14) ``` # plot the data based on OM treatment == "OM3" ```{r} # plot the data based on OM treatment == "REF" plot_ordination(subset_samples(physeq_rar_RA, LTSP.Treatment == "OM3"), ord, type = "sample", color = "LTSP.Treatment", title = "PCoA (weighted UniFrac)", shape = "Horizon") + # manually adjust colors for points scale_colour_manual(values = c("black")) + # add elipse automatically stat_ellipse(type = "norm", size = 1) + # how to change the legend title guides(color = guide_legend("LTSP Treatment")) + # add theme theme_bw(base_size = 14) ``` # plot the data based on other variables (Mean annual precipitation) ```{r} # plot the data physeq_rar_RA %>% plot_ordination(ord, type = "sample", color = "Mean.Annual.Precipitation..mm.", title = "PCoA (weighted UniFrac)") + # how to change the legend title guides(color = guide_legend("Mean annual precipitation")) + # add theme theme_bw(base_size = 14) ``` # plot the data based on other variables (Ecozone) ```{r} # plot the data physeq_rar_RA %>% plot_ordination(ord, type = "sample", color = "Ecozone", title = "PCoA (weighted UniFrac)") + # how to change the legend title guides(color = guide_legend("Ecozone")) + # add theme theme_bw(base_size = 14) ``` # plot the data based on other variables (Ecozone) ```{r} # plot the data physeq_rar_RA %>% plot_ordination(ord, type = "sample", color = "Site", title = "PCoA (weighted UniFrac)") + # how to change the legend title guides(color = guide_legend("Ecozone")) + # add theme theme_bw(base_size = 14) ``` #PCoA plot of all the regions (BC, Texas, Ontario, California; with no filters) ```{r} # diversity analysis requires rarefied taxa tables # we know from Qiime that we should rarefy to 6041, so right palm samples level out physeq_rar <- rarefy_even_depth(physeq, sample.size = 6000) # convert to RA (relative abundance) physeq_rar_RA <- transform_sample_counts(physeq_rar, function(x) x/sum(x)) # use our data to do beta-diversity analysis # we define the type of analysis using the ordinate() function and setting the method = PCoA, setting distance to type # of beta diversity analysis # you can change analysis to jaccard, bray-curtis and so on ord <- ordinate(physeq_rar_RA, method = "PCoA", distance = "wunifrac") # plot the data plot_ordination(physeq_rar_RA, ord, type = "sample", color = "Region", title = "PCoA (weighted UniFrac)") + # manually adjust colors for points scale_colour_manual(values = c("blue", "red", "black", "orange")) + # add elipse automatically stat_ellipse(type = "norm", size = 1) + # how to change the legend title guides(color = guide_legend("Region")) #+ # add theme #theme_bw(base_size = 14) ``` In this script we will look at doing logistical regression analysis of alpha-diversity metrics to control for confounding variables that may be affecting the diversity. 1. We load the data from alpha diversity metrics donwloaded from the QIIME2 view online sofware. We do this by inputing the resultinf files from the 'alpha-diversity significance' commands in QIIME2 into the QIIME2 view software. (Refer to the QIIME2 working script for more details on this) - Shannon diversity - observed OTUs - Pielous eveness - Faith metric we do this by doing putting the alpha diversity .qzv files that we previously calculated (see pg X of lab book)into qiime view online software and then downloading the results in TSVs formats and loading them into R to start, we first load the necessary libraries ```{r} library(dplyr) library(tidyr) library(ggplot2) library(tidyverse) ``` **now we load the data into R and merge the different alpha diversity metrics and the metadata into one table object** NOTE: the .tsv files downlaoded from the qiime table already contain some categorical columns, but they omit numerical ones. In our regression analysis we are going to use both of them. Therefore we need to load the metadata and add some of this numerical coumns into our final object ```{r} # load each of the tables independently shannon <- read_tsv("shannon.tsv") # we take out the first row which have the categorizations based on qiime observed <- read_tsv("observed_features.tsv") faiths <- read_tsv("faiths.tsv") evenness <- read_tsv("evenness.tsv") # load the metadata metadata <- read_csv("soil_metadata.csv") %>% rename(id = `#SampleID`) # rename the #SampleID column to a column name that we can math to other data frames #merge the tables into a table that contains the the alpha-diversity metrics and their sample id alpha_metrics <- bind_cols(shannon[,c(1, 18)], observed[,18], faiths[,18], evenness[,18]) %>% as.data.frame() # this converts the table into a data.frame # join the alpha_metrics with the metadata based on their sample id alpha_metrics <- right_join(metadata, alpha_metrics, by = "id") ``` 2. now using the weighted unifract beta diversity plot, we highligh the variables that have clustering patterns and we also look at which of them are highly correlated (or have the same clustering patterns), such that we only account for one of them in our data **we look at the structure of the data and see if it has been categorized corrrectly** ```{r} str(alpha_metrics) ``` **We need to do two things to prepare this data for analysis** 1. we need to filter the columns that will be useful in our analysis. The filtering parameters are as follows: - They need to have more than one group in the variable - There cannot be two variables that are highly correlated with each other (we check this using the beta diversity plots and the alias() function in R) - we cannot use data with missing values in our model since these values will be dropped 2. change the columns that are read as "chr" types to "factor" ```{r} # we are now selecting for only the metrics that we will be using in our analysis using the select() function alpha_metrics <- alpha_metrics %>% select(Ecozone, #where samples were collected (same as climatic zone) `Collection Date`, #the date they were collected Site, #the place in BC they were collected (same as with Tree cover, longitude, latitude, and elevation) Horizon, # the soil horizon (same as sampling depth) `LTSP Treatment`, # the LTSP treatment `Mean Annual Temperature (Celsius)`, # The mean annual temperature of the colleciton site `Soil Classification`, # the type of soil `Compaction Treatment`, # how the soild was compacted pH, # note that there are missing values! `Total Carbon`, # note that there are missing values! `Total Nitrogen`, # note that there are missing values! `Soil Bulk Density`, # note that there are missing values! `CN Ratio`, # note that there are missing values! `Moisture Content`, # No missing values!!! shannon_entropy, observed_features, pielou_evenness, faith_pd) # now we change the data type in each column to represent "chr" columns as "factor" type alpha_metrics <- alpha_metrics %>% mutate(Ecozone = as.factor(Ecozone), # 2 ezocones `Collection Date` = as.factor(`Collection Date`), Horizon = as.factor(Horizon), `LTSP Treatment` = as.factor(`LTSP Treatment`), Site = as.factor(Site), # 6 collection "Site" `Mean Annual Temperature (Celsius)` = as.factor(`Mean Annual Temperature (Celsius)`), `Compaction Treatment` = as.factor(`Compaction Treatment`), `Soil Classification` = as.factor(`Soil Classification`) ) #we will also rename some variable to make them easier to manage alpha_metrics <- alpha_metrics %>% rename(Date = `Collection Date`, Treatment = `LTSP Treatment`, Annual_temperature = `Mean Annual Temperature (Celsius)`, Soil_classification = `Soil Classification`, Compaction = `Compaction Treatment`) # we check to see that we corrected the types and renamed the columns correctly str(alpha_metrics) ``` 3. now we can do our logistical regression analysis of the alpha-diversity metrics - shannon diversity with only categorical variables! (we will use the glm() and summary() R based functions to do this) ```{r} # calculate the shannon regression and the summary shannon_regression <- glm(shannon_entropy ~ Ecozone + Horizon + Treatment + Site + Compaction + `Moisture Content`, data = alpha_metrics) summary(shannon_regression) ``` Calculate the R^2 value of the shannon model ```{r} # calculate the R^2 value of the model ll.null <- shannon_regression$null.deviance/-2 ll.proposed <- shannon_regression$deviance/-2 (ll.null - ll.proposed)/ll.null ``` Plot the data for shannon diversity ```{r} # create a new data frame object containing the predicted alpha diversity values and metadata categories predicted.data.shannon <- data.frame( probability.of.shannon = predict(shannon_regression, type = "response"), shannon_entropy = alpha_metrics$shannon_entropy, Ecozone = alpha_metrics$Ecozone, Horizon = alpha_metrics$Horizon, Treatment = alpha_metrics$Treatment, Site = alpha_metrics$Site, Compaction = alpha_metrics$Compaction, Temperature = alpha_metrics$Annual_temperature, Date = alpha_metrics$Date, pH = alpha_metrics$pH, Carbon = alpha_metrics$`Total Carbon`, Nitrogen = alpha_metrics$`Total Nitrogen`, Moisture = alpha_metrics$`Moisture Content` ) # rearrange the LTSP treatments to order the plot predicted.data.shannon$Treatment <- factor(predicted.data.shannon$Treatment, levels = c("REF","OM1", "OM2", "OM3")) # plot Treatment vs the predicted values ggplot(predicted.data.shannon, aes(x= Treatment, y= probability.of.shannon)) + geom_boxplot(color = "black",fill='#A4A4A4') + geom_jitter(position = position_jitter(0.2)) + scale_y_continuous(limits = c(5.4, 7.0)) + labs(x = "LTSP treatment", y = "Corrected Shannon Value") + theme_classic() ``` one way anova test ```{r} anova_shannon <- aov(probability.of.shannon ~ Treatment, data = predicted.data.shannon) summary(anova_shannon) ``` - model for faith alpha diversity metric ```{r} regression_faiths <- glm(faith_pd ~ (Ecozone + Horizon + Treatment + Site + Compaction + `Moisture Content`), data = alpha_metrics) summary(regression_faiths) ``` Calculate the R^2 value ```{r} ll.null <- regression_faiths$null.deviance/-2 ll.proposed <- regression_faiths$deviance/-2 (ll.null - ll.proposed)/ll.null ``` plot the data ```{r} # create a new data frame object containing the predicted alpha diversity values and metadata categories predicted.data.faiths <- data.frame( probability.of.faith = predict(regression_faiths, type = "response"), Faith_pd = alpha_metrics$faith_pd, Ecozone = alpha_metrics$Ecozone, Horizon = alpha_metrics$Horizon, Treatment = alpha_metrics$Treatment, Site = alpha_metrics$Site, Compaction = alpha_metrics$Compaction ) # rearrange the LTSP treatments to order the plot predicted.data.faiths$Treatment <- factor(predicted.data.faiths$Treatment, levels = c("REF","OM1", "OM2", "OM3")) # plot Treatment vs the predicted values ggplot(predicted.data.faiths, aes(x= Treatment, y= probability.of.faith)) + geom_boxplot(color = "black",fill='#A4A4A4') + geom_jitter(position = position_jitter(0.2)) + scale_y_continuous(limits = c(14, 44)) + labs(x = "LTSP treatment", y = "Controlled Faiths Value") + theme_classic() ``` make an ANOVA comparison ```{r} anova_faiths <- aov(probability.of.faith ~ Treatment, data = predicted.data.faiths) summary(anova_faiths) ``` - regression model for pielou's evenness metric ```{r} regression_pielou <- glm(pielou_evenness ~ (Ecozone + Horizon + Treatment + Site + Compaction + `Moisture Content`), data = alpha_metrics) summary(regression_pielou) ``` Calculate R^2 value for model ```{r} ll.null <- regression_pielou$null.deviance/-2 ll.proposed <- regression_pielou$deviance/-2 (ll.null - ll.proposed)/ll.null ``` ```{r} predicted.data.pielou <- data.frame( probability.of.evenness = predict(regression_pielou, type = "response"), evenness = alpha_metrics$pielou_evenness, Ecozone = alpha_metrics$Ecozone, Horizon = alpha_metrics$Horizon, Treatment = alpha_metrics$Treatment, Site = alpha_metrics$Site, Compaction = alpha_metrics$Compaction ) # rearrange the LTSP treatments to order the plot predicted.data.pielou$Treatment <- factor(predicted.data.pielou$Treatment, levels = c("REF","OM1", "OM2", "OM3")) # plot Treatment vs the predicted values ggplot(predicted.data.pielou, aes(x= Treatment, y= probability.of.evenness)) + geom_boxplot(color = "black",fill='#A4A4A4') + geom_jitter(position = position_jitter(0.2)) + scale_y_continuous(limits = c(0.86, 0.96)) + labs(x = "LTSP treatment", y = "Controlled Evenness Value") + theme_classic() ``` one way anova analysis of predicted values ```{r} anova_pielou <- aov(probability.of.evenness ~ Treatment, data = predicted.data.pielou) summary(anova_pielou) ``` - regression model for observed_features ```{r} # calculate the regression model using glm() and then use summary() to see the coefficients regression_observed <- glm(observed_features ~ (Ecozone + Horizon + Treatment + Site + Compaction + `Moisture Content`), data = alpha_metrics) summary(regression_observed) ``` Calculate the R^2 value of the model ```{r} ll.null <- regression_observed$null.deviance/-2 ll.proposed <- regression_observed$deviance/-2 (ll.null - ll.proposed)/ll.null ``` Plot the data ```{r} predicted.data.observed <- data.frame( probability.of.features = predict(regression_observed, type = "response"), features = alpha_metrics$observed_features, Ecozone = alpha_metrics$Ecozone, Horizon = alpha_metrics$Horizon, Treatment = alpha_metrics$Treatment, Site = alpha_metrics$Site, Compaction = alpha_metrics$Compaction ) # rearrange the LTSP treatments to order the plot predicted.data.observed$Treatment <- factor(predicted.data.observed$Treatment, levels = c("REF","OM1", "OM2", "OM3")) # plot Treatment vs the predicted values ggplot(predicted.data.observed, aes(x= Treatment, y= probability.of.features)) + geom_boxplot(color = "black",fill='#A4A4A4') + geom_jitter(position = position_jitter(0.2)) + scale_y_continuous(limits = c(60, 220)) + labs(x = "LTSP treatment", y = "Controlled Oberseved Features Value") + theme_classic() ``` one way anova analysis of predicted observed features ```{r} anova_observed <- aov(probability.of.features ~ Treatment, data = predicted.data.observed) summary(anova_observed) ``` The following script is for an indicator taxa analysis to look at taxa in specific OM treatments that are not found in others **Load the packages needed** ```{r} library(dplyr) library(phyloseq) library(indicspecies) library(ape) library(ggplot2) library(forcats) library(tidyverse) library(broom) library(gridExtra) ``` **Function to group ASVs table by higher order taxonomy (By Ilan Rubin)** ```{r} group_by_taxonomy = function(asv_table, taxonomy, rank){ asv_table = as.data.frame(asv_table) taxonomy = as.data.frame(taxonomy) taxonomy$ASV = rownames(taxonomy) asv_table$ASV = rownames(asv_table) asv_table = inner_join(taxonomy,asv_table,by="ASV") asv_table$taxa = apply(asv_table[,1:rank],1,paste,collapse=" ") asv_table = asv_table[,-(1:8)] asv_table = group_by(data.frame(asv_table),taxa) taxa_table = as.data.frame(summarise_all(asv_table,sum)) rownames(taxa_table) = taxa_table$taxa return(taxa_table[,-1]) } ``` **Load the data** This step assumes you have run the 'qiime export' commands previously and have the downloaded files. Refer to the QIIME2 working script if you need to see this commands. ```{r} biom = import_biom("exported/table-with-taxonomy.biom") metadata <- import_qiime_sample_data("soil_metadata.txt") tree <- read_tree_greengenes("exported/tree.nwk") tree <- multi2di(tree) # assuming you read in your tree.nwk file as the variable tree ``` **Create phyloseq object and filter it based on metadata** ```{r} physeq <- merge_phyloseq(biom, metadata, tree) # filter data on metadata filtered_phyloseq <- subset_samples(physeq, Region == "British Columbia") ``` **subset the data into tables needed for our analysis** ```{r} taxa_table <- otu_table(filtered_phyloseq) taxonomy <- tax_table(filtered_phyloseq) metadata <- sample_data(filtered_phyloseq) ``` **Create the taxa table for the rank of taxonomy required** Kingdom=1, Phylum=2, Class=3, Order=4, Family=5, Genus=6, Species=7 ```{r} taxa_table = group_by_taxonomy(taxa_table, taxonomy, 3) ``` ### Calculate indicator values ```{r} indicator_multipatt = multipatt(t(taxa_table),metadata$LTSP.Treatment,duleg=T) summary(indicator_multipatt) indicator_output = capture.output(summary(indicator_multipatt,indvalcomp=TRUE)) write.table(indicator_output,file="indiciator_values.txt",row.names=F,quote=F) ``` # taxonomic analysis of selected taxonomic classes **Import the taxonomic species data from taxonomy-bar plot** ```{r} species_data <- read_csv("level-7.csv") head(species_data) ``` **import function that determines relative abundance analysis at a certain taxonomic grouping** ```{r} source("RelativeTaxa.R") ``` **make the relative abundance analysis of the classified ASVs for each sample** Data is grouped and the family taxonomic level (taxa_level = 3) ```{r} dat <- RelativeData(species_data, merge = TRUE, taxa_level = 3, Metadata = c(451:length(species_data))) # merge means that it will group ASVs at a higher taxonomy order # taxa_level refers to the taxonomic leve. In this case it is "Class" # metadata is just the columns in the metadata should not be taking into consideration in the analysis ``` **find the metadata columns** ```{r} metadata_names <- colnames(species_data)[451: length(species_data)] metadata_names ``` **Write a vector with the classes from the indicator taxa analysis that were significant and that have an adequate classification** ```{r} significant_results <- c("c__Alphaproteobacteria", "c__Ellin6529", "c__Gemmatimonadetes", "c__Bacilli", "c__P2-11E", "c__Chloroflexi", "c__Betaproteobacteria", "c__iii1-8", "c__Nitrospira", "c__TK10", "c__[Chloracidobacteria]", "c__Gitt-GS-136", "c__Ktedonobacteria", "c__Solibacteres", "c__Gammaproteobacteria", "c__Acidobacteriia", "c__Actinobacteria", "c__Sphingobacteriia", "c__Cytophagia", "c__Opitutae", "c__Chthonomonadetes", "c__Armatimonadia", "c__Verrucomicrobiae") ``` **now we need to run our significant results through a logistic regression model to account for other confounding variables and determine in which models OM treatments are significant in explaining variation in abundance ** this step just renames some categorical variables to make it easier ```{r} # rename some of the categorical variables to make them easier to use dat <- dat %>% rename(Date = `Collection Date`, Treatment = `LTSP Treatment`, Annual_temperature = `Mean Annual Temperature (Celsius)`, Soil_classification = `Soil Classification`, Compaction = `Compaction Treatment`) ``` **In this step we create a "for" loop that stablishes which of the taxonomic classes from our indicator taxa analysis is significanly explained by OM removal treatment when controlling for other confounding variables** ```{r} # create data frame with corrected relative abundance values dat_corrected <- dat %>% select(index, Date, Treatment, Ecozone, Horizon, Treatment, Site, Compaction, `Moisture Content`) # Make a for loop that checks which of the taxonomic classes from the indicator taxa analysis # is significantly explained by OM removal treatment and stored the predicted values from the # logistic regression model in a dataframe filtered_significant_results <- vector(mode = "character") for (i in significant_results){ x <- dat[i] %>% unlist corrected_values <- predict(glm(x ~ Ecozone + Horizon + Treatment + Site + Compaction + `Moisture Content`, data = dat), type = "response") LR <- tidy(glm(x ~ Ecozone + Horizon + Treatment + Site + Compaction + `Moisture Content`, data = dat)) if (LR[4, 5] <= 0.05 | LR[5,5] <= 0.05 | LR[6,5] <= 0.05){ filtered_significant_results <- append(filtered_significant_results, i) dat_corrected[i] <- corrected_values # adds the corrected values to our data frame } } dat_corrected ``` Create boxplots of a subset of significant classes ```{r} # assign order in which boxplots will be plotted dat_corrected$Treatment <- factor(dat$Treatment, levels = c("REF", "OM1", "OM2", "OM3")) a1 <- ggplot(dat_corrected, aes(x= Treatment, y = c__Alphaproteobacteria, fill = Treatment)) + geom_boxplot() + labs(x = "OM removal treatment", y = "Predicted Relative Abundance", title = str_sub("c__Alphaproteobacteria",4 )) + theme_classic() + scale_fill_npg() + scale_y_continuous(labels = scales::percent) a2 <- ggplot(dat_corrected, aes(x= Treatment, y = c__Acidobacteriia, fill = Treatment)) + geom_boxplot() + labs(x = "OM removal treatment", y = "Predicted Relative Abundance", title = str_sub("c__Acidobacteriia",4 )) + theme_classic() + scale_fill_npg() + scale_y_continuous(labels = scales::percent) a3 <- ggplot(dat_corrected, aes(x= Treatment, y = c__Bacilli, fill = Treatment)) + geom_boxplot() + labs(x = "OM removal treatment", y = "Predicted Relative Abundance", title = str_sub("c__Bacilli",4 )) + theme_classic() + scale_fill_npg() + scale_y_continuous(labels = scales::percent) a4 <- ggplot(dat_corrected, aes(x= Treatment, y = c__Gammaproteobacteria, fill = Treatment)) + geom_boxplot() + labs(x = "OM removal treatment", y = "Predicted Relative Abundance", title = str_sub("c__Gammaproteobacteria",4 )) + theme_classic() + scale_fill_npg() + scale_y_continuous(labels = scales::percent) a5 <- ggplot(dat_corrected, aes(x= Treatment, y = c__Actinobacteria, fill = Treatment)) + geom_boxplot() + labs(x = "OM removal treatment", y = "Predicted Relative Abundance", title = str_sub("c__Actinobacteria",4 )) + theme_classic() + scale_fill_npg() + scale_y_continuous(labels = scales::percent) a6 <- ggplot(dat_corrected, aes(x= Treatment, y = c__Ellin6529, fill = Treatment)) + geom_boxplot() + labs(x = "OM removal treatment", y = "Predicted Relative Abundance", title = str_sub("c__Ellin6529",4 )) + theme_classic() + scale_fill_npg() + scale_y_continuous(labels = scales::percent) ``` ```{r} # arrange the plos of the relevant taxas grid.arrange(a1, a2, a3, a4, a5, a6, ncol = 2) ``` # Plot the remaining significant classes ```{r} x <- gather(dat_corrected, key = "Taxa", value = "Count", filtered_significant_results) %>% mutate(Taxa = as.factor(Taxa)) %>% ggplot(aes(x= Treatment, y = Count, fill = Treatment)) + geom_boxplot() + labs(x = "OM removal treatment", y = "Predicted Relative Abundance") + theme_classic() + scale_fill_npg() + scale_y_continuous(labels = scales::percent) + facet_wrap(. ~ Taxa, nrow = 5, ncol = 3, scales = "free") ``` # Species level classification of 16S experiment ## Start with loading all necessary packages: ```{r} library(dplyr) library(tidyr) library(ggplot2) library(forcats) library(tidyverse) library(RColorBrewer) library(ggsci) ``` ## Import the species data and take a look at it ```{r} species_data <- read_csv("level-7.csv") head(species_data) ``` # find the metadata columns ```{r} metadata_names <- colnames(species_data)[451: length(species_data)] metadata_names metadata <- species_data[,451:length(species_data)] ``` # import function to do the relative abundance analysis ```{r} source("RelativeTaxa.R") ``` # make the relative abundance analysis of the classified ASVs for each sample Data is grouped and the family taxonomic level (taxa_level = 3) ```{r} dat <- RelativeData(species_data, merge = TRUE, taxa_level = 3, Metadata = c(451:length(species_data))) ``` # plot the stacked bar plots Gather the data in a readable format for ggplot2 ```{r} # gether the data in a format that is readable for ggplot2 dat_gather <- dat %>% gather(key = "Class", value = "Count", -c(1, metadata_names)) %>% data.frame ``` Ploting formatting variables used to plot taxonomic bar plot ```{r} theme1 <- theme(legend.title = element_text(size = 10), legend.text = element_text(size = 5), axis.text.x = element_text(angle = 45, hjust = 1), legend.box = "vertical") guide1 <- guides(shape = guide_legend(override.aes = list(size = 1)), color = guide_legend(override.aes = list(size = 0.5)), fill = guide_legend(ncol = 2)) geom_stacked <- stat_summary(geom = "bar", fun.data = mean_se, position = "fill") ``` ```{r} # Plot the composition in relationship to OM treatment and faceted based on soil horizon dat_gather$LTSP.Treatment <- factor(dat_gather$LTSP.Treatment, levels = c("REF", "OM1", "OM2", "OM3")) OM_plot <- dat_gather %>% filter(Count >= 0.01) %>% #filter(Species != "Unassigned") %>% ggplot(aes(x = LTSP.Treatment, y = Count, fill = Class)) + geom_stacked + facet_grid( ~ Horizon, scales = "free_x" ) + # Change legend size and the x axis labels to be vertical: theme1 + guide1 + labs(x = "OM removal Treatment", y= "Relative abundance") OM_plot ``` ## function for cleaning up taxa_barplot .csv file from qiime # Load important packages library(tidyverse) library(dplyr) library(forcats) library(tidyr) choose_indexes <- function(dat, type = "numeric"){ indexes <- c() n_row <- nrow(dat) # number of rows n_col <- ncol(dat) # number of columns for (i in 1:n_col){ if (class(unlist(dat[i])) == type){ indexes <- append(indexes, i) } } return(indexes) } extract_taxa <- function(name, level = 7, nearest_taxa = FALSE) { split_data <- strsplit(name, ";") %>% unlist # splits the name based into a vector if (length(split_data) == 1){ return(name)} element <- split_data[level] # extract last element of the vector which should contain the species classification if (nearest_taxa == TRUE){ while (level > 0){ if (nchar(element) == 3 & grepl(".__", element) == TRUE | element == "__"){ level <- level - 1 element <- split_data[level] next} else { return(element)} } } else if (nearest_taxa == FALSE){ if (nchar(element) == 3 & grepl(".__", element) == TRUE |element == "__"){ return("Unassigned")} else { return(element)} } return("Unassigned") # if nothing was able to classify! } RelativeData <- function(dat, merge = TRUE, taxa_level = 7, nearest_taxa = FALSE, Metadata = c(), TaxaCutoff = 0.005, SampleFilter = FALSE, SampleCutoff = 0,Pseudocount = FALSE){ ####### addd a remove paramenter to take care of metadata ##################### FACTORS ######################################################################### # Change any character column to factors --> easier to work with and more functionality character_indexes <- choose_indexes(dat, type = "character") dat[, character_indexes] <- lapply(dat[, character_indexes], as.factor) # change all columns with character value to factors ##################### Change the column names to the chosen taxa ####################################### new_column_names <- sapply(colnames(dat), extract_taxa, taxa_level, nearest_taxa) names(dat) <- new_column_names #################### separate the metadata from the data for further analysis ######################### if (length(Metadata) > 0){ metadata <- dat[, Metadata] if (class(Metadata) == "numeric" | class(Metadata) == "integer"){ dat <- dat[, -Metadata] } else if (class(Metadata) == "character" | class(Metadata) == "factor") { dat <- dat[, !names(dat) %in% Metadata] } } ##################### Merge the columns with same name ################################################# # Merge columns with the samen name together if merge = TRUE #collapse columns that have the same name (sum them together) if (merge == TRUE){ col_n <- ncol(dat) character_indexes <- choose_indexes(dat, type = "factor") dat2 <- t(dat[, -character_indexes]) aggr <- by(dat2, INDICES= row.names(dat2), FUN= colSums) aggr <- as.data.frame(do.call(cbind,aggr)) dat <- cbind(dat[, character_indexes[1]], aggr, dat[, character_indexes[-1]]) } ##################### Find the relative counts ########################################################## # find the variable indexes that are numeric numeric_indexes <- choose_indexes(dat) # calculate total counts and set it as a data frame total_counts <- as.data.frame(rowSums(dat[, numeric_indexes]), nm = "total_counts") # bind the new data frame to the existing one and filter samples if the user selects so! if (SampleFilter == TRUE){ dat <- dat %>% bind_cols(total_counts) %>% filter(!total_counts <= SampleCutoff)} else{ dat<- bind_cols(dat, total_counts)} # change 0 values to pseudocount if Pseudocount variable is TRUE if (Pseudocount == TRUE){ dat <- dat %>% mutate_if(is.numeric, ~if_else(. <= 0.000000, 0.0000001, .)) } # obtain indexes of only numeric variables numeric_indexes <- choose_indexes(dat) # subset data frame to only include the numeric variables dat2 <- dat[, numeric_indexes] # Find the relative proportion of each sample based on its total_counts n_row <- nrow(dat2) # number of rows n_col <- ncol(dat2) # number of columns for (r in 1:n_row){ for (c in 1:n_col) { dat2[r, c] <- dat2[r, c]/dat2[r, n_col] # this calculates the proportion } } #merge this data again into original data frame character_indexes <- choose_indexes(dat, "factor") dat <- bind_cols(dat[character_indexes[1]], dat2, dat[character_indexes[-1]]) dat["total_counts"] <- NULL ##################### Columns with chosen counts############################################################# # take out any colums which have 0 counts for every row column_sums <- c() col_n <- ncol(dat) for (i in 1:col_n){ if (class(unlist(dat[i])) == "numeric"){ if (colSums(dat[i]) <= TaxaCutoff){ column_sums <- append(column_sums, i) } } } dat[ column_sums] <- NULL ###################### Bind the metadata again to the processed data ############################### #character_indexes <- choose_indexes(metadata, type = "character") #metadata <- lapply(metadata[, character_indexes], as.factor) # change all metadata columns to factors if character if (length(Metadata) > 0) { dat <- bind_cols(dat, metadata) } return(as.data.frame(dat)) }