#ALL ANALYSES #Start working on the server ssh root@10.19.139.153 #Move into the data directory cd /data #Make directories for the research aims mkdir Research_Aim_1 mkdir Research_Aim_2 mkdir Research_Aim_3A mkdir Research_Aim_3B #Import the data that has already been demultiplexed qiime tools import \ --type 'SampleData[SequencesWithQuality]' \ --input-path /mnt/datasets/project_2/tanzania/tanzania_manifest.txt \ --output-path demux.qza \ --input-format SingleEndFastqManifestPhred33V2 #Convert demux.qza to demuz.qzv to visualize the data and determine an appropriate sequence length using the interactive quality plot qiime demux summarize --i-data demux.qza --o-visualization demux.qzv #Use stats-dada2.qzv to determine if we chose a length that includes a large portion of the reads of interest after sequences quality control using DADA2 qiime dada2 denoise-single \ --i-demultiplexed-seqs /data/demux.qza \ --p-trim-left 6 \ --p-trunc-len 150 \ --o-representative-sequences rep-seqs-dada2_150.qza \ --o-table table-dada2.qza \ --o-denoising-stats stats-dada2.qza qiime metadata tabulate \ --m-input-file stats-dada2.qza \ --o-visualization stats-dada2.qzv #p-trunc-len 150 seems most appropriate for the data, and will therefore be used moving forward #Create directory for denoised data and insert rep-seqs-dada2_150.qza and table-dada2.qza mkdir denoise_150 mv rep-seqs-dada2_150.qza denoise_150 mv table-dada2.qza denoise_150 #Copied the metadata.txt file from the mnt directory to the data directory using cd /mnt/datasets/project_2/tanzania/ #and cp tanzania_metadata.txt /data/ #Creating summaries of feature table and feature data using table.qza and rep-seqs.qza from sequences truncated at 150 nt #Change directory to denoise_150 where table.qza and rep-seqs.qza are located cd /data/denoise_150 #Create a .qzv file for table.qza qiime feature-table summarize \ --i-table table-dada2.qza \ --o-visualization table.qzv \ --m-sample-metadata-file /data/tanzania_metadata.txt #Create a visualization of rep-seqs.qza qiime feature-table tabulate-seqs \ --i-data rep-seqs-dada2_150.qza \ --o-visualization rep-seqs_150.qzv # Metadata-base filtering for longitudinal analysis # Change working directory to Reasearch_Aim_1 # Filter for only the human fecal samples using table.qza in order to keep only the sample of interest. qiime feature-table filter-samples \ --i-table /data/denoise_150/table-dada2.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --p-where "[sample_type]='feces' AND [host_common_name]='human'" \ --o-filtered-table human_feces-table.qza # Create 2 data subsets for each host life stage: child and adult. qiime feature-table filter-samples \ --i-table human_feces-table.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --p-where "[host_life_stage]='child'" \ --o-filtered-table child_feces-table.qza qiime feature-table filter-samples \ --i-table human_feces-table.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --p-where "[host_life_stage]='adult'" \ --o-filtered-table adult_feces-table.qza # Convert qza file to qzv to visualize the filtered table qiime feature-table summarize \ --i-table child_feces-table.qza \ --o-visualization child_feces-table.qzv \ --m-sample-metadata-file /data/tanzania_metadata.txt qiime feature-table summarize \ --i-table adult_feces-table.qza \ --o-visualization adult_feces-table.qzv \ --m-sample-metadata-file /data/tanzania_metadata.txt # filter metadata in R data <- read_csv("tanzania_metadata.csv") data <- data %>% filter(data$host_life_stage %in% c("child", "adult")) data$unique_name_type_season <- gsub("\\_.*", "", data$unique_name_type_season) data$X1 <- NULL data$season[data$season %in% c("2014-EW", "2014-LW")] <- "Wet14" data$season[data$season %in% c("2013-ED", "2013-LD")] <- "Dry13" data$season[data$season %in% c("2014-ED", "2014-LD")] <- "Dry14" data <- data %>% filter(bush_camp %in% c("HUKAMAKO", "SENGELI")) write.table(data, file='tanzania_metadata_season_camp_noinfant.tsv', quote=FALSE, sep='\t', row.names=FALSE) #Filer medadata for all human fecal samples in order to conduct beta diversity analysis: qiime feature-table filter-samples \ --i-table /data/denoise_150/table-dada2.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --p-where "[sample_type]='feces' AND [host_common_name]='human'" \ --o-filtered-table human-fecal-3A-table.qza #Visualize metadata by making a qzv file: qiime feature-table summarize \ --i-table human-fecal-3A-table.qza \ --o-visualization human-fecal-3A-table.qzv \ --m-sample-metadata-file /data/tanzania_metadata.txt #Metadata-based filtering to retain fecal samples from 2014 ED season #Change working directory to Research_Aim_2 cd /data/Research_Aim_2 #First filtering step: retain only those samples which were collected during 2014 ED season and which have a documented water source #For water source, exclude 'not collected' qiime feature-table filter-samples \ --i-table /data/denoise_150/table-dada2.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --p-where "[season]='2014-ED' AND NOT [water_source]='not_collected'" \ --o-filtered-table 2014_ED_filtered-table.qza #Second filtering step: within 2014_ED_filtered-table.qza, exclude those samples whose water source is 'not applicable' qiime feature-table filter-samples \ --i-table 2014_ED_filtered-table.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --p-where "[season]='2014-ED' AND NOT [water_source]='not_applicable'" \ --o-filtered-table 2014_ED_water_documented_filtered-table.qza #Visualize 2014_ED_water_documented_filtered-table.qza file qiime feature-table summarize \ --i-table 2014_ED_water_documented_filtered-table.qza \ --o-visualization 2014_ED_water_documented_filtered-table.qzv \ --m-sample-metadata-file /data/tanzania_metadata.txt #Metadata-based filtering to retain all skin swab samples #Move into the Research_Aim_3B directory to begin working with the right hand data cd Research_Aim_3B #Filter out all samples from the metadata that are not human right hand samples qiime feature-table filter-samples \ --i-table /data/denoise_150/table-dada2.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --p-where "[sample_type_merged] ='HUMAN_R_HAND'" \ --o-filtered-table right-hand-filtered-table.qza #Visualize the file to confirm that only right hand samples were kept qiime feature-table summarize \ --i-table right-hand-filtered-table.qza \ --o-visualization right-hand-filtered-table.qzv \ --m-sample-metadata-file /data/tanzania_metadata.txt #Phylogenetic analysis #Generate a tree for phylogenetic diversity analyses #Store these in the common directory cd /data/ #Create unrooted and rooted trees qiime phylogeny align-to-tree-mafft-fasttree \ --i-sequences /data/denoise_150/rep-seqs-dada2_150.qza \ --o-alignment aligned-rep-seqs.qza \ --o-masked-alignment masked-aligned-rep-seqs.qza \ --o-tree unrooted-tree.qza \ --o-rooted-tree rooted-tree.qza #Alpha rarefaction for longitudinal analysis qiime diversity alpha-rarefaction \ --i-table human_feces-table.qza \ --i-phylogeny /data/rooted-tree.qza \ --p-max-depth 20000 \ --m-metadata-file /data/tanzania_metadata.txt \ --o-visualization human_feces-alpha-rarefaction.qzv # create a visualization of human-faces feature table to observe the number of lost samples qiime feature-table summarize \ --i-table human_feces-table.qza \ --o-visualization human_feces-table.qzv \ --m-sample-metadata-file /data/tanzania_metadata.txt #Alpha rarefaction for all fecal samples #Create alpha rarefaction plot using filtered table qiime diversity alpha-rarefaction \ --i-table human-fecal-3A-table.qza \ --i-phylogeny /data/rooted-tree.qza \ --p-max-depth 20000 \ --m-metadata-file /data/tanzania_metadata.txt \ --o-visualization alpha-rarefaction-3A.qzv #Alpha rarefaction for fecal samples from 2014 ED season #change working directory to Research_Aim_2 cd Research_Aim_2/ #create alpha rarefaction visualization qiime diversity alpha-rarefaction \ --i-table 2014_ED_water_documented_filtered-table.qza \ --i-phylogeny /data/rooted-tree.qza \ --p-max-depth 20000 \ --m-metadata-file /data/tanzania_metadata.txt \ --o-visualization alpha-rarefaction_20000.qzv #Alpha rarefaction for all skin swab samples #Alpha rarefaction curves now need to be plotted so that an appropriate sequence depth can be chosen for the right hand samples #This will make it so that any samples with an insufficient amount of reads are not included in the beta diversity analyses qiime diversity alpha-rarefaction \ --i-table right-hand-filtered-table.qza \ --i-phylogeny /data/rooted-tree.qza \ --p-max-depth 20000 \ --m-metadata-file /data/tanzania_metadata.txt \ --o-visualization right-hand-alpha-rarefaction.qzv #The number of samples discarded at a specific sampling depth can then be visualized using the interactive sample detail of right-hand-filtered-table.qzv #Seasonal pattern of adult and child fecal samples from 2 bush camps # Analysis using filtered metadata and visualize PCoA plot for seasonal pattern in R # filter feature table according to filtered metadata qiime feature-table filter-samples \ --i-table /data/Research_Aim_1/human_feces-table.qza \ --m-metadata-file /data/Research_Aim_1/tanzania_metadata_season_camp_noinfant.tsv \ --o-filtered-table filtered-table.qza # calculate diversity metrics again with the filtered table qiime diversity core-metrics-phylogenetic \ --i-phylogeny /data/rooted-tree.qza \ --i-table filtered-table.qza \ --p-sampling-depth 6500 \ --m-metadata-file /data/Research_Aim_1/tanzania_metadata_season_camp_noinfant.tsv \ --output-dir core-metrics-results_filtered # recalculate the significance between seasons qiime diversity beta-group-significance \ --i-distance-matrix /data/Research_Aim_1/core-metrics-results_filtered/unweighted_unifrac_distance_matrix.qza \ --m-metadata-file /data/Research_Aim_1/tanzania_metadata_season_camp_noinfant.tsv \ --m-metadata-column season \ --o-visualization core-metrics-results_filtered/unweighted-unifrac-season-significance.qzv \ --p-pairwise # visualizing it in R # Loading packages library(tidyverse) library(vegan) library(phyloseq) library(DESeq2) library(ape) # Importing all data files biom_file <- import_biom("exported/table-with-taxonomy.biom") metadata <- import_qiime_sample_data("tanzania_metadata_season_camp_noinfant.tsv") tree <- read_tree_greengenes("exported/tree.nwk") tree <- multi2di(tree) # Combine all objects into phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) # Set set of random numbers set.seed(711) # Rename column names for taxonomic ranks colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species") # Filter data based on metadata category with == feces <- subset_samples(physeq, sample_type == "feces" & host_common_name == "human") # Exclude samples with less than 6500 reads feces_6500 <- prune_samples(sample_sums(feces) >= 6500, feces) # Beta diversity PCoA plot physeq_rar <- rarefy_even_depth(feces_6500, sample.size = 6500) # Convert to RA (relative abundance) physeq_rar_RA <- transform_sample_counts(physeq_rar, function(x) x/sum(x)) ord <- ordinate(physeq_rar_RA, method = "PCoA", distance = "unifrac", weighted = FALSE) # Plot data plot_ordination(physeq_rar_RA, ord, type = "samples", color = "season", title = "PCoA (Unweighted Unifrac)") + scale_colour_manual(values = c("Dry13" = "#98D391", "Wet" = "#A480DD", "Dry14" = "#589152"), labels = c("2013 Dry", "2014 Dry", "2014 Wet")) + stat_ellipse(type = "norm", size = 1, linetype = 2) + guides(colour = guide_legend("Seasons")) + theme_bw(base_size = 14) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + xlab("PCo1 (16.9%)") + ylab("PCo2 (7.4%)") #Beta diversity analysis for all fecal samples #Move into Research Aim 3A folder cd /data/Research_Aim_3A #Create alpha and beta diversity core metrics qiime diversity core-metrics-phylogenetic \ --i-phylogeny /data/rooted-tree.qza \ --i-table human-fecal-3A-table.qza \ --p-sampling-depth 6000 \ --m-metadata-file /data/tanzania_metadata.txt \ --output-dir human-fecal-3A-core-metrics-results #Create Beta diversity boxplots with PERMANOVA values qiime diversity beta-group-significance \ --i-distance-matrix human-fecal-3A-core-metrics-results/unweighted_unifrac_distance_matrix.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column sex \ --o-visualization human-fecal-3A-core-metrics-results/unweighted-unifrac-sex-significance.qzv \ --p-pairwise qiime diversity beta-group-significance \ --i-distance-matrix human-fecal-3A-core-metrics-results/weighted_unifrac_distance_matrix.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column sex \ --o-visualization human-fecal-3A-core-metrics-results/weighted-unifrac-sex-significance.qzv \ --p-pairwise qiime diversity beta-group-significance \ --i-distance-matrix human-fecal-3A-core-metrics-results/bray_curtis_distance_matrix.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column sex \ --o-visualization human-fecal-3A-core-metrics-results/bray_curtis_sex_significance.qzv \ --p-pairwise qiime diversity beta-group-significance \ --i-distance-matrix human-fecal-3A-core-metrics-results/jaccard_distance_matrix.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column sex \ --o-visualization human-fecal-3A-core-metrics-results/jaccard_sex_significance.qzv \ --p-pairwise #Recreating PCoA plot of weighted UniFrac for all fecal samples in R #Recreating graphs in R #Move into Research Aim 3A directory #Export the data qiime tools export \ --input-path human-fecal-3A-table.qza \ --output-path exported qiime tools export \ --input-path /data/taxonomy.qza \ --output-path exported qiime tools export \ --input-path /data/rooted-tree.qza \ --output-path exported #Use a text editor to change column names nano exported/taxonomy.tsv #Combine BIOM data with taxonomy biom add-metadata \ -i exported/feature-table.biom \ -o exported/table-with-taxonomy.biom \ --observation-metadata-fp exported/taxonomy.tsv \ --sc-separated taxonomy #Move all files to local computer and place in the R directory you will be using #Follow R Script adapted from Stephan Koenig # Loading packages library(tidyverse) library(vegan) library(phyloseq) library(DESeq2) BiocManager::install("latticeExtra") # Importing all data files biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("tanzania_metadata.txt") tree <- read_tree_greengenes("tree.nwk") library(ape) tree <- multi2di(tree) # Combine all objects into phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) # Overview of phyloseq object physeq # Use sample_data() to look at metadata of phyloseq object # Look only at first 6 lines of table with head() head(sample_data(physeq)) # Set set of random numbers set.seed(711) # taxonomic rank names rank_names(physeq) # Rename column names for taxonomic ranks colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species") rank_names(physeq) # format data as table with as.data.frame() as.data.frame(sample_names(physeq)) # Show read count for each sample using sample_sums() as.data.frame(sample_sums(physeq)) # Beta diversity PCoA plot # Diversity requires rarefied taxa tables # We know from Qiime that we should rarefy to so right palm # samples level out physeq_rar <- rarefy_even_depth(physeq, sample.size = 6000) # Convert to RA (relative abundance) physeq_rar_RA <- transform_sample_counts(physeq_rar, function(x) x/sum(x)) # We define the type of analysis with ordinate() and setting the # method = PCoA, setting distance to type of beta diversity analysis # you can change analysis to jaccard, bray-curtis and so on ord <- ordinate(physeq_rar_RA, method = "PCoA", distance = "wunifrac") # Plot data with labels and colour plot_ordination(physeq_rar_RA, ord, type = "sample_type_merged", color = "sex", title = "PCoA (Weighted Unifrac)") + annotate(geom = "text", label = "Fecal Samples", x= -0.07, y= 0.06) + theme_bw()+ theme(legend.text = element_text(size=10))+ guides(shape= guide_legend(override.aes = list(size=4)))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) #Beta diversity analysis for fecal samples from 2014 ED season #change directory to Research_Aim_2 cd /data/Research_Aim_2/ #alpha and beta diversity analysis qiime diversity core-metrics-phylogenetic \ --i-phylogeny /data/rooted-tree.qza \ --i-table 2014_ED_water_documented_filtered-table.qza \ --p-sampling-depth 12000 \ --m-metadata-file /data/tanzania_metadata.txt \ --output-dir core-metrics-results #Beta diversity group significance according to bush camp qiime diversity beta-group-significance \ --i-distance-matrix core-metrics-results/weighted_unifrac_distance_matrix.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column bush_camp \ --o-visualization core-metrics-results/weighted-unifrac-bushcamp-group-significance.qzv \ --p-pairwise #Recreate weighted UniFrac plot for fecal samples from 2014 ED season in R #Exporting biom data from qiime #Exporting filtered table.qza qiime tools export \ --input-path 2014_ED_water_documented_filtered-table.qza \ --output-path exported #Exporting taxonomy.qza qiime tools export \ --input-path /data/taxonomy.qza \ --output-path exported #Exporting rooted-tree.qza qiime tools export \ --input-path /data/rooted-tree.qza \ --output-path exported #Open file in a text editor #Combine taxonomy with BIOM data biom add-metadata \ -i exported/feature-table.biom \ -o exported/table-with-taxonomy.biom \ --observation-metadata-fp exported/taxonomy.tsv \ --sc-separated taxonomy #Import table-with-taxonomy.biom and tree.nwk to local computer scp root@10.19.139.153:/data/Research_Aim_2/exported/table-with-taxonomy.biom . scp root@10.19.139.153:/data/Research_Aim_2/exported/tree.nwk . scp root@10.19.139.153:/data/tanzania_metadata.txt . #R script for PCoA plot and taxa bar plot #Trying to make a pretty PCoA plot #Loading packages library(tidyverse) library(vegan) library(phyloseq) library(DESeq2) # Helper functions gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x)) } # Importing all data files biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("tanzania_metadata.txt") tree <- read_tree_greengenes("tree.nwk") library(ape) tree<-multi2di(tree) # Combine all objects into phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) #Overview of phyloseq object physeq # Set set of random numbers set.seed(711) # taxonomic rank names rank_names(physeq) # Rename column names for taxonomic ranks colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species") rank_names(physeq) # Use sample_data() to look at metadata of phyloseq object # Look only at first 6 lines of table with head() sample_data(physeq) #Filter out all water samples no_water<-subset_samples(physeq, sample_type_merged != "ENVIRONMENTAL_WATER") sample_data(no_water) # format data as table with as.data.frame() as.data.frame(sample_names(no_water)) # Show read count for each sample using sample_sums() as.data.frame(sample_sums(no_water)) # Beta diversity PCoA plot # Diversity requires rarefied taxa tables # We know from Qiime that we should rarefy to 12000 physeq_rar <- rarefy_even_depth(no_water, sample.size = 12000) # Convert to RA (relative abundance) physeq_rar_RA <- transform_sample_counts(physeq_rar, function(x) x/sum(x)) # We define the type of analysis with ordinate() and setting the # method = PCoA, setting distance to type of beta diversity analysis # you can change analysis to jaccard, bray-curtis and so on ord <- ordinate(physeq_rar_RA, method = "PCoA", distance = "wunifrac") # Plot data plot_ordination(physeq_rar_RA, ord, type = "sample", color = "bush_camp", title = "PCoA (Weighted Unifrac)") + guides(colour = guide_legend("Bush Camp")) + # Adding text to plot annotate(geom = "text", label = "Fecal Samples 2014 ED", x = -0.2, y = -0.55, size = 4) + theme_bw() #Beta diversity analysis of all skin swab samples #Move into the Research_Aim_3B directory cd Research_Aim_3B #Perform diversity analysis on the right hand microbiome data #A sampling depth of 8070 will be used, as determined by performing alpha rarefaction qiime diversity core-metrics-phylogenetic \ --i-phylogeny /data/rooted-tree.qza \ --i-table right-hand-filtered-table.qza \ --p-sampling-depth 8070 \ --m-metadata-file /data/tanzania_metadata.txt \ --output-dir right-hand-core-metrics-results #Generate beta diversity box and whisker plots, along with PERMANOVA to compare male and female right hand samples. qiime diversity beta-group-significance \ --i-distance-matrix right-hand-core-metrics-results/unweighted_unifrac_distance_matrix.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column sex \ --o-visualization right-hand-core-metrics-results/unweighted-unifrac-sex-significance.qzv \ --p-pairwise qiime diversity beta-group-significance \ --i-distance-matrix right-hand-core-metrics-results/weighted_unifrac_distance_matrix.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column sex \ --o-visualization right-hand-core-metrics-results/weighted-unifrac-sex-significance.qzv \ --p-pairwise qiime diversity beta-group-significance \ --i-distance-matrix right-hand-core-metrics-results/bray_curtis_distance_matrix.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column sex \ --o-visualization right-hand-core-metrics-results/bray-curtis-sex-significance.qzv \ --p-pairwise qiime diversity beta-group-significance \ --i-distance-matrix right-hand-core-metrics-results/jaccard_distance_matrix.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column sex \ --o-visualization right-hand-core-metrics-results/jaccard-sex-significance.qzv \ --p-pairwise #Remake desired plot (Weighted Unifrac) in R for all skin swab samples #Export the data qiime tools export \ --input-path right-hand-filtered-table.qza \ --output-path exported qiime tools export \ --input-path /data/taxonomy.qza \ --output-path exported qiime tools export \ --input-path /data/rooted-tree.qza \ --output-path exported #Use a text editor to change column names nano exported/taxonomy.tsv #Change "Feature ID" to "#OTUID" #Change "Taxon" to "taxonomy" #Change "Condifence" to "confidence" #Combine the BIOM data with taxonomy biom add-metadata \ -i exported/feature-table.biom \ -o exported/table-with-taxonomy.biom \ --observation-metadata-fp exported/taxonomy.tsv \ --sc-separated taxonomy #Move all exported files to the local computer and open R # Loading packages library(tidyverse) library(vegan) library(phyloseq) library(DESeq2) # Helper functions gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x)) } # Import all data files biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("tanzania_metadata.txt") tree <- read_tree_greengenes("tree.nwk") library(ape) tree <- multi2di(tree) # Combine all objects into a phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) # Overview of phyloseq object physeq # Use sample_data() to look at metadata of phyloseq object # Look only at first 6 lines of table with head() head(sample_data(physeq)) # Create set of random numbers set.seed(711) # taxonomic rank names rank_names(physeq) # Rename column names for taxonomic ranks colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species") rank_names(physeq) # format data as table with as.data.frame() as.data.frame(sample_names(physeq)) # Show read count for each sample using sample_sums() as.data.frame(sample_sums(physeq)) # Create a beta diversity PCoA plot # Diversity requires rarefied taxa tables # We know from Qiime2 that we should rarefy to 8070 physeq_rar <- rarefy_even_depth(physeq, sample.size = 8070) # Convert to RA (relative abundance) physeq_rar_RA <- transform_sample_counts(physeq_rar, function(x) x/sum(x)) # Define the type of analysis with ordinate() and set the method = PCoA, with distance equating to the type of beta diversity analysis ord <- ordinate(physeq_rar_RA, method = "PCoA", distance = "wunifrac") # Plot data plot_ordination(physeq_rar_RA, ord, type = "sample_type_merged", color = "sex", title = "PCoA (Weighted Unifrac)") + # Add text to the plot and clean up the figure annotate(geom = "text", label = "Right Hand Samples", x = - 0.08, y = 0.07, size = 5)+ geom_point(aes(shape=bush_camp, size=2), show.legend=F)+ scale_shape_manual(name="bush camp", values=c(7,9,15,17,18,19))+ theme_bw()+ theme(legend.text = element_text(size=10))+ guides(shape= guide_legend(override.aes = list(size=4)))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank() #Training a taxonomic classifier #Train taxonomic classifier to perform taxonomic analysis #Download and unzip the Greengenes 99% database wget "ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz" gunzip gg_13_8_otus.tar.gz tar -xvf gg_13_8_otus.tar qiime tools import \ --type 'FeatureData[Sequence]' \ --input-path gg_13_8_otus/rep_set/99_otus.fasta \ --output-path ref-otus.qza qiime tools import \ --type 'FeatureData[Taxonomy]' \ --input-format HeaderlessTSVTaxonomyFormat \ --input-path gg_13_8_otus/taxonomy/99_otu_taxonomy.txt \ --output-path ref-taxonomy.qza #Convert database containg full-length 16S reference sequences to one that contains same length sequences from 16S regions sequenced in the Smits et al. paper #Smits et al. used 515F and 806R primers qiime feature-classifier extract-reads \ --i-sequences ref-otus.qza \ --p-f-primer GTGCCAGCMGCCGCGGTAA \ --p-r-primer GGACTACNVGGGTWTCTAAT \ --p-trunc-len 150 \ --o-reads ref-seqs.qza #Train the classifier using reference reads qiime feature-classifier fit-classifier-naive-bayes \ --i-reference-reads ref-seqs.qza \ --i-reference-taxonomy ref-taxonomy.qza \ --o-classifier classifier.qza #Use the newly trained classifier to conduct taxonomic analysis qiime feature-classifier classify-sklearn \ --i-classifier classifier.qza \ --i-reads denoise_150/rep-seqs-dada2_150.qza \ --o-classification taxonomy.qza qiime metadata tabulate \ --m-input-file taxonomy.qza \ --o-visualization taxonomy.qzv #Make taxa bar plot for water samples from 2014 ED season in R #Loading packages library(tidyverse) library(vegan) library(phyloseq) library(DESeq2) # Helper functions gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x)) } # Importing all data files biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("tanzania_metadata.txt") tree <- read_tree_greengenes("tree.nwk") library(ape) tree<-multi2di(tree) # Combine all objects into phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) #Overview of phyloseq object physeq # Set set of random numbers set.seed(711) # taxonomic rank names rank_names(physeq) # Rename column names for taxonomic ranks colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species") rank_names(physeq) # Use sample_data() to look at metadata of phyloseq object # Look only at first 6 lines of table with head() sample_data(physeq) #Trying to make a taxa bar plot physeq sample_data(physeq) #transform into relative abundances physeq2<-transform_sample_counts(physeq, function(x) x / sum(x) ) physeq2 #choose only fecal samples water<-subset_samples(physeq2, sample_type_merged == "ENVIRONMENTAL_WATER") water #create dataframe from phyloseq object wat<-psmelt(water) #turn all OTUs into family counts glom <- tax_glom(water, taxrank = "Family") glom # should list # taxa as # phyla data <- psmelt(glom) # create dataframe from phyloseq object data$Family <- as.character(data$Family) #convert to character #simple way to rename phyla with < 2.5% abundance data$Family[data$Abundance < 0.025] <- "< 2.5% abund." #Adding factor levels to env_feature and changing the values levels(data$env_feature) <- c(levels(data$env_feature), "Dry Riverbed", "Stream", "Well") data$env_feature[data$env_feature == "dry river"] <- "Dry Riverbed" data$env_feature[data$env_feature == "stream"] <- "Stream" data$env_feature[data$env_feature == "water well"] <- "Well" #plot with condensed phyla into "< 2.5% abund" category p <- ggplot(data=data, aes(x=env_feature, y=Abundance, fill=Family))+ geom_bar(aes(), stat="identity", position="fill") + xlab("Water Source") + ylab("Relative Abundance") + scale_fill_manual(values = c("darkblue", "darkgoldenrod1", "darkseagreen", "darkorchid", "darkolivegreen1", "lightskyblue", "darkgreen", "deeppink", "khaki2", "firebrick", "brown1", "darkorange1", "cyan1")) + theme(legend.position = "right") #Differential abundance analysis for all skin swab samples between males and females #Filter out low frequency ASVs prior to differential abundance testing. #Total number of reads= 2132568. #2132568 x 0.00005 = 106.6, therefore we will set the p-min frequency to 107. qiime feature-table filter-features \ --i-table right-hand-filtered-table.qza \ --p-min-frequency 107 \ --o-filtered-table right-hand-filtered-table-2.qza #Filter out non-bacterial sequences prior to differential abundance testing. qiime taxa filter-table \ --i-table right-hand-filtered-table-2.qza \ --i-taxonomy /data/taxonomy.qza \ --p-exclude mitochondia,chloroplast,archaea \ --o-filtered-table right-hand-filtered-table-3.qza #Perform differential abundance testing using ANCOM to find features that are differentially abundant between males and females. qiime composition add-pseudocount \ --i-table right-hand-filtered-table-3.qza \ --o-composition-table comp-right-hand-filtered-table.qza qiime composition ancom \ --i-table comp-right-hand-filtered-table.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column sex \ --o-visualization ancom-right-hand-sex.qzv #Perform the analysis at a specific taxonomic level (at the genus level here). qiime taxa collapse \ --i-table right-hand-filtered-table-3.qza \ --i-taxonomy /data/taxonomy.qza \ --p-level 6 \ --o-collapsed-table right-hand-table-lvl6.qza qiime composition add-pseudocount \ --i-table right-hand-table-lvl6.qza \ --o-composition-table comp-right-hand-table-lvl6.qza qiime composition ancom \ --i-table comp-right-hand-table-lvl6.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column sex \ --o-visualization ancom-right-hand-sex-lvl6.qzv #Differential abundance analysis of fecal samples from 2014 ED season according to bush camp #Differential abundance analysis for human fecal samples according to bush camps #Need more than one fecal sample per bush camp #need to filter out environmental samples first qiime feature-table filter-samples \ --i-table 2014_ED_water_documented_filtered-table.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --p-where "[sample_type_merged]='HUMAN_FECES'" \ --o-filtered-table 2014_ED_feces_documented_water_table.qza #Filter out Hukamako sample since there's only one qiime feature-table filter-samples \ --i-table 2014_ED_feces_documented_water_table.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --p-where "[bush_camp] IN ('BENJA', 'ENDAMANGA_SCHOOL', 'EZEKILI', 'GIDAMILANDA', 'MSAFIRI', 'MWAMUDU', 'SAITOTI', 'SENGELI')" \ --o-filtered-table bush_camp_filtered_no_hukamako_table.qza #filter out low frequency ASVs qiime feature-table filter-features \ --i-table bush_camp_filtered_no_hukamako_table.qza \ --p-min-frequency 2 \ --o-filtered-table bushcamp_nohukamako_frequency-filtered-table.qza #filter out non-bacterial sequences qiime taxa filter-table \ --i-table bushcamp_nohukamako_frequency-filtered-table.qza \ --i-taxonomy /data/taxonomy.qza \ --p-exclude mitochondria,chloroplast,archea \ --o-filtered-table bushcamp_nohukamako_frequency-filtered-table.qza #add pseudocounts qiime composition add-pseudocount \ --i-table bushcamp_nohukamako_frequency-filtered-table.qza \ --o-composition-table comp_bush_camp_table.qza #run ANCOM on the bush_camp column qiime composition ancom \ --i-table comp_bush_camp_table.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column bush_camp \ --o-visualization ancom_bush_camp.qzv #collapse features to the family level qiime taxa collapse \ --i-table bushcamp_nohukamako_frequency-filtered-table.qza \ --i-taxonomy /data/taxonomy.qza \ --p-level 5 \ --o-collapsed-table bush_camp_table_l5.qza qiime composition add-pseudocount \ --i-table bush_camp_table_l5.qza \ --o-composition-table comp_bush_camp_table_l5.qza qiime composition ancom \ --i-table comp_bush_camp_table_l5.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column bush_camp \ --o-visualization l5_ancom_bush_camp.qzv #Differential abundance analysis of all fecal samples by sex #Filter out low frequency ASV #Total number of reads = 12,115,030 -> 12115030 x 0.00005 =605.75 -> will set p-min at 6060 qiime feature-table filter-features \ --i-table human-fecal-3A-table.qza \ --p-min-frequency 606 \ --o-filtered-table human-fecal-3A-table-2.qza #Filter out non-bacterial sequences prior to differential abundance tests qiime taxa filter-table \ --i-table human-fecal-3A-table-2.qza \ --i-taxonomy /data/taxonomy.qza \ --p-exclude mitochondria,chloroplast,archaea \ --o-filtered-table human-fecal-3A-table-3.qza #Perform differential abundance testing using ANCOM qiime composition add-pseudocount \ --i-table human-fecal-3A-table-3.qza \ --o-composition-table comp-fecal-3A-table.qza qiime composition ancom \ --i-table comp-fecal-3A-table.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column sex \ --o-visualization ancom-fecal-sex.qzv #Repeat ANCOM but only at the genus level (level 6) qiime taxa collapse \ --i-table human-fecal-3A-table-3.qza \ --i-taxonomy /data/taxonomy.qza \ --p-level 6 \ --o-collapsed-table human-fecal-3A-table-lvl6.qza qiime composition add-pseudocount \ --i-table human-fecal-3A-table-lvl6.qza \ --o-composition-table comp-fecal-3A-table-lvl6.qza qiime composition ancom \ --i-table comp-fecal-3A-table-lvl6.qza \ --m-metadata-file /data/tanzania_metadata.txt \ --m-metadata-column sex \ --o-visualization ancom-fecal-sex-lvl6.qzv #Longitudinal analysis # Longitudinal analysis # add taxonomy data to .biom file biom add-metadata -i feature-table.biom -o table-with-taxonomy-filtered.biom -m /data/Research_Aim_1/tanzania_metadata_season_camp_noinfant.tsv --observation-metadata-fp taxonomy.tsv --sc-separated taxonomy ## dry season qiime longitudinal pairwise-distances \ --i-distance-matrix /data/Research_Aim_1/core-metrics-results_filtered/unweighted_unifrac_distance_matrix.qza \ --m-metadata-file /data/Research_Aim_1/tanzania_metadata_season_camp_noinfant.tsv \ --p-group-column host_life_stage \ --p-state-column season \ --p-state-1 Dry13 \ --p-state-2 Dry14 \ --p-individual-id-column unique_name_type_season \ --p-replicate-handling random \ --o-visualization dry-season-pairwise-differences_filtered.qzv ## wet to dry 2013, following the figure from the original paper. The wet seasons has been combined qiime longitudinal pairwise-distances \ --i-distance-matrix /data/Research_Aim_1/core-metrics-results_filtered/unweighted_unifrac_distance_matrix.qza \ --m-metadata-file /data/Research_Aim_1/tanzania_metadata_season_camp_noinfant.tsv \ --p-group-column host_life_stage \ --p-state-column season \ --p-state-1 Dry13 \ --p-state-2 Wet14 \ --p-individual-id-column unique_name_type_season \ --p-replicate-handling random \ --o-visualization wet13-season-pairwise-differences_filtered.qzv ## wet 2014 to dry 2014, following the figure from the original paper qiime longitudinal pairwise-distances \ --i-distance-matrix /data/Research_Aim_1/core-metrics-results_filtered/unweighted_unifrac_distance_matrix.qza \ --m-metadata-file /data/Research_Aim_1/tanzania_metadata_season_camp_noinfant.tsv \ --p-group-column host_life_stage \ --p-state-column season \ --p-state-1 Dry14 \ --p-state-2 Wet14 \ --p-individual-id-column unique_name_type_season \ --p-replicate-handling random \ --o-visualization wet14-season-pairwise-differences_filtered.qzv