#!/bin/bash # Log in to Linux System and Server wsl sshroot@ # Creation of project 2 directory and navigation to it. cd / cd data mkdir parkinsons_project_2 cd parkinsons_project_2 # Obtaining demux files qiime tools import \ --type 'SampleData[SequencesWithQuality]' \ --input-path /mnt/datasets/project_2/parkinsons/parkinsons_manifest.txt \ --output-path demux.qza \ --input-format SingleEndFastqManifestPhred33V2 # Creating visualization of demultiplexed samples qiime demux summarize \ --i-data demux.qza \ --o-visualization demux.qzv # Determining ASVs with DADA2 qiime dada2 denoise-single \ --i-demultiplexed-seqs demux.qza \ --p-trim-left 0 \ --p-trunc-len 251 \ --o-representative-sequences rep-seqs.qza \ --o-table table.qza \ --o-denoising-stats stats.qza # Visualize DADA2 stats qiime metadata tabulate \ --m-input-file stats.qza \ --o-visualization stats.qzv # Visualize ASVs stats qiime feature-table summarize \ --i-table table.qza \ --o-visualization table.qzv \ --m-sample-metadata-file /mnt/datasets/project_2/parkinsons/parkinsons_metadata.txt qiime feature-table tabulate-seqs \ --i-data rep-seqs.qza \ --o-visualization rep-seqs.qzv # Filter Metadata for non-PD (Steps 1a, 2a, 3a) qiime feature-table filter-samples \ --i-table table.qza \ --m-metadata-file /mnt/datasets/project_2/parkinsons/parkinsons_metadata.txt \ --p-where "[Disease]='Control' " \ --o-filtered-table Control-filtered-table.qza # Filter out ‘null’ sleep problems qiime feature-table filter-samples \ --i-table Control-filtered-table.qza \ --m-metadata-file /mnt/datasets/project_2/parkinsons/parkinsons_metadata.txt \ --p-where "[Sleep_problems] IN ('Yes', 'No')" \ --o-filtered-table Control-Sleep-filtered-table.qza # Visualize feature table filtered for non-PD and ‘yes’ and ‘no’ sleep problems qiime feature-table summarize \ --i-table Control-Sleep-filtered-table.qza\ --o-visualization Control-Sleep-filtered-table.qzv \ --m-sample-metadata-file /mnt/datasets/project_2/parkinsons/parkinsons_metadata.txt # Feature based filtering - 1,095,289x0.00005= 54.76445 qiime feature-table filter-features \ --i-table Control-Sleep-filtered-table.qza \ --p-min-frequency 55 \ --o-filtered-table feature-frequency-filtered-table.qza # Taxonomy table qiime feature-classifier classify-sklearn \ --i-classifier /mnt/datasets/classifiers/gg-13-8-99-515-806-nb-classifier.qza \ --i-reads rep-seqs.qza \ --o-classification taxonomy.qza qiime metadata tabulate \ --m-input-file taxonomy.qza \ --o-visualization taxonomy.qzv # Filter out mitochondria, chloroplast, and archaea sequences qiime taxa filter-table \ --i-table feature-frequency-filtered-table.qza \ --i-taxonomy taxonomy.qza \ --p-exclude mitochondria,chloroplast,archaea \ --o-filtered-table no-m-c-a-feature-frequency-filtered-table.qza # Visualize feature table filtered for non-PD and ‘yes’ and ‘no’ sleep problems and without # mitochondria, chloroplast, and archaea sequences qiime feature-table summarize \ --i-table no-m-c-a-feature-frequency-filtered-table.qza \ --o-visualization no-m-c-a-feature-frequency-filtered-table.qzv \ --m-sample-metadata-file /mnt/datasets/project_2/parkinsons/parkinsons_metadata.txt # Taxonomic bar plot or Taxonomic analysis (Step 3b) qiime taxa barplot \ --i-table no-m-c-a-feature-frequency-filtered-table.qza \ --i-taxonomy taxonomy.qza \ --m-metadata-file /mnt/datasets/project_2/parkinsons/parkinsons_metadata.txt \ --o-visualization taxa-bar-plots.qzv # Generation of a tree for phylogenetic diversity analyses qiime phylogeny align-to-tree-mafft-fasttree \ --i-sequences rep-seqs.qza \ --o-alignment aligned-rep-seqs.qza \ --o-masked-alignment masked-aligned-rep-seqs.qza \ --o-tree unrooted-tree.qza \ --o-rooted-tree rooted-tree.qza # Generation of Alpha Rarefaction Plot (rarefaction depth will be picked from filtered table) qiime diversity alpha-rarefaction \ --i-table no-m-c-a-feature-frequency-filtered-table.qza \ --i-phylogeny rooted-tree.qza \ --p-max-depth 12100 \ --m-metadata-file /mnt/datasets/project_2/parkinsons/parkinsons_metadata.txt\ --o-visualization alpha-rarefaction.qzv # Calculation of alpha-diversity and beta-diversity metrics qiime diversity core-metrics-phylogenetic \ --i-phylogeny rooted-tree.qza \ --i-table no-m-c-a-feature-frequency-filtered-table.qza \ --p-sampling-depth 10403\ --m-metadata-file /mnt/datasets/project_2/parkinsons/parkinsons_metadata.txt\ --output-dir core-metrics-results # Creation of Alpha Diversity Plots (Step 1b) #Creation of Pielou’s Evenness Plot qiime diversity alpha-group-significance \ --i-alpha-diversity core-metrics-results/evenness_vector.qza \ --m-metadata-file /mnt/datasets/project_2/parkinsons/parkinsons_metadata.txt\ --o-visualization core-metrics-results/evenness-group-significance.qzv #Creation of Observed Features Plot qiime diversity alpha-group-significance \ --i-alpha-diversity core-metrics-results/observed_features_vector.qza \ --m-metadata-file /mnt/datasets/project_2/parkinsons/parkinsons_metadata.txt\ --o-visualization core-metrics-results/observed-features-group-significance.qzv # Creation of Beta Diversity Plots (Step 2b) #Creation of Weighted Unifrac Box plot and PCoA plot qiime diversity beta-group-significance \ --i-distance-matrix core-metrics-results/weighted_unifrac_distance_matrix.qza \ --m-metadata-file /mnt/datasets/project_2/parkinsons/parkinsons_metadata.txt\ --m-metadata-column Sleep_problems \ --o-visualization core-metrics-results/weighted-unifrac-sleep-problems-significance.qzv qiime emperor plot \ --i-pcoa core-metrics-results/weighted_unifrac_pcoa_results.qza \ --m-metadata-file /mnt/datasets/project_2/parkinsons/parkinsons_metadata.txt\ --o-visualization core-metrics-results/weighted-unifrac-emperor-sleep-problems.qzv # BETA DIVERSITY ANALYSIS # Loading all required packages for generating beta plots library(tidyverse) library(vegan) library(phyloseq) library(DESeq2) library(ape) # Helper functions gm_mean <- function(x, na.rm = TRUE){ exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x)) } # Setup helper functions (ignore the setup, the functions will help you) # Calculate relative abundance calculate_RA <- function(x) x/sum(x) # Importing all reqruired data files for beta analysis biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("parkinsons_metadata.txt") tree <- read_tree_greengenes("tree.nwk") tree = multi2di(tree) # Combine all objects into phyloseq objects physeq <- merge_phyloseq(biom_file, metadata, tree) # Generating beta diversity PCoA plot # Diversity requires rarefied taxa tables # We know from Qiime that we should rarefy to 10403 so gut # samples for both groups with and without sleep problems level out physeq_rar <- rarefy_even_depth(physeq, sample.size = 10403) # BETA DIVERSITY ANALYSIS FOR SAMPLES SEPARATED BY SLEEP PROBLEMS # Only keep control samples nopd <- subset_samples(physeq_rar, Disease == "Control") nopd nonull <- subset_samples(nopd, Sleep_problems != "") nonull # Convert to RA (relative abundance) physeq_rar_RA <- transform_sample_counts(nonull, function(x) x/sum(x)) # Calculate number of samples in 'Yes' and 'No' sleep problems groups summarise(group_by(sample_data(physeq_rar_RA),Sleep_problems),count=length(Sleep_problems)) # We define the type of analysis with ordinate() and setting the # method = PCoA for principal coordinate analysis, setting distance to type of beta diversity analysis # Can change distance parameter within the ordinate function to jaccard (jaccard), # bray-curtis (bray), weighted unifrac (wunifrac) or unweighted unifrac (unifrac). ord <- ordinate(physeq_rar_RA, method = "PCoA", distance = "wunifrac") # Plot data (This function already calls ggplot), change title according to # the type of beta analysis that was run, change colour parameter based on metadata category tested plot_ordination(physeq_rar_RA, ord, type = "sample", color = "Sleep_problems", title = "PCoA (Weighted Unifrac)") + # Adding txt to plot annotate(geom = "text", label ="", x = - 0.08, y = 0.18, size = 4) + # Add ellipse around data points of same group stat_ellipse(type = "norm", size = 1) + # Change legend title guides(colour = guide_legend("Sleep problems")) + theme_bw(base_size = 14) # BETA DIVERSITY ANALYSIS FOR SAMPLES SEPARATED BY COFFEE HABITS # Only keep control samples nopd <- subset_samples(physeq_rar, Disease == "Control") nopd nocoffeenull <- subset_samples(nopd, Coffe_drinker != "") nocoffeenull # Convert to RA (relative abundance) physeq_rar_RA <- transform_sample_counts(nocoffeenull, function(x) x/sum(x)) # Calculate number of samples in 'Yes' and 'No' sleep problems groups summarise(group_by(sample_data(physeq_rar_RA),Coffe_drinker),count=length(Coffe_drinker)) # We define the type of analysis with ordinate() and setting the # method = PCoA for principal coordinate analysis, setting distance to type of beta diversity analysis # Can change distance parameter within the ordinate function to jaccard (jaccard), # bray-curtis (bray), weighted unifrac (wunifrac) or unweighted unifrac (unifrac). ord <- ordinate(physeq_rar_RA, method = "PCoA", distance = "wunifrac") # Plot data (This function already calls ggplot), change title according to # the type of beta analysis that was run, change colour parameter based on metadata category tested plot_ordination(physeq_rar_RA, ord, type = "sample", color = "Coffe_drinker", title = "PCoA (Weighted Unifrac)") + # Adding txt to plot annotate(geom = "text", label ="", x = - 0.08, y = 0.18, size = 4) + # Add ellipse around data points of same group stat_ellipse(type = "norm", size = 1) + # Change legend title guides(colour = guide_legend("Coffee drinker")) + theme_bw(base_size = 14) # BETA DIVERSITY ANALYSIS FOR SAMPLES SEPARATED BY SEX # Only keep control samples nopd <- subset_samples(physeq_rar, Disease == "Control") nopd # Convert to RA (relative abundance) physeq_rar_RA <- transform_sample_counts(nopd, function(x) x/sum(x)) # Calculate number of samples in 'Yes' and 'No' sleep problems groups summarise(group_by(sample_data(physeq_rar_RA),Sex),count=length(Sex)) # We define the type of analysis with ordinate() and setting the # method = PCoA for principal coordinate analysis, setting distance to type of beta diversity analysis # Can change distance parameter within the ordinate function to jaccard (jaccard), # bray-curtis (bray), weighted unifrac (wunifrac) or unweighted unifrac (unifrac). ord <- ordinate(physeq_rar_RA, method = "PCoA", distance = "wunifrac") # Plot data (This function already calls ggplot), change title according to # the type of beta analysis that was run, change colour parameter based on metadata category tested plot_ordination(physeq_rar_RA, ord, type = "sample", color = "Sex", title = "PCoA (Weighted Unifrac)") + # Adding txt to plot annotate(geom = "text", label ="", x = - 0.08, y = 0.18, size = 4) + # Add ellipse around data points of same group stat_ellipse(type = "norm", size = 1) + # Change legend title guides(colour = guide_legend("Sex")) + theme_bw(base_size = 14) # BETA DIVERSITY ANALYSIS FOR SAMPLES SEPARATED BY ANTIDEPRESSANT USE # Only keep control samples nopd <- subset_samples(physeq_rar, Disease == "Control") nopd noantidepressantnull <- subset_samples(nopd, Antidepressant_use != "") noantidepressantnull # Convert to RA (relative abundance) physeq_rar_RA <- transform_sample_counts(noantidepressantnull, function(x) x/sum(x)) # Calculate number of samples in 'Yes' and 'No' sleep problems groups summarise(group_by(sample_data(physeq_rar_RA),Antidepressant_use),count=length(Antidepressant_use)) # We define the type of analysis with ordinate() and setting the # method = PCoA for principal coordinate analysis, setting distance to type of beta diversity analysis # Can change distance parameter within the ordinate function to jaccard (jaccard), # bray-curtis (bray), weighted unifrac (wunifrac) or unweighted unifrac (unifrac). ord <- ordinate(physeq_rar_RA, method = "PCoA", distance = "wunifrac") # Plot data (This function already calls ggplot), change title according to # the type of beta analysis that was run, change colour parameter based on metadata category tested plot_ordination(physeq_rar_RA, ord, type = "sample", color = "Antidepressant_use", title = "PCoA (Weighted Unifrac)") + # Adding txt to plot annotate(geom = "text", label ="", x = - 0.08, y = 0.18, size = 4) + # Add ellipse around data points of same group stat_ellipse(type = "norm", size = 1) + # Change legend title guides(colour = guide_legend("Antidepressant use")) + theme_bw(base_size = 14) # DIFFERENTIAL ABUNDANCE ANALYSIS, RELATIVE ABUNDANCE OF FIRMICUTES AND BACTEROIDETES, # FIRMICUTES TO BACTEROIDETES RATIO ANALYSIS # Load CRAN packages library(tidyverse) library(vegan) # Load Bioconductor packages library(phyloseq) library(DESeq2) library (rstatix) library(ape) # Setup helper functions (ignore the setup, the functions will help you) # Calculate relative abundance calculate_RA <- function(x) x/sum(x) # define this function below, which we'll use in a minute gm_mean <- function(x, na.rm = TRUE) { exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x)) } # Export biom file and tree from QIIME2 and provide original metadata file biom_file <- import_biom("table-with-taxonomy.biom") metadata <- import_qiime_sample_data("parkinsons_metadata.txt") tree <- read_tree_greengenes("tree.nwk") tree = multi2di(tree) # Combine all information into a single phyloseq object physeq <- merge_phyloseq(biom_file, metadata, tree) physeq # Convert taxonomic rank from numbers to proper names colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species") # Keep only abundant ASVs # First determine counts of ASV across all samples total_counts <- taxa_sums(physeq) # Calculate relative abundance for each ASV relative_abundance <- calculate_RA(total_counts) # Determine which ASVs are more abundant than 0.1% # Change this if you want a different cutoff (0.001 = 0.1%) abundant <- relative_abundance > 0.001 abundant_taxa <- prune_taxa(abundant, physeq) # Check the resulting new phyloseq object with much fewer taxa abundant_taxa # now your phyloseq object is called "Genus" phylum <- tax_glom(abundant_taxa, taxrank = "Phylum", NArm = FALSE) phylum # Only keep control samples nopd <- subset_samples(phylum, Disease == "Control") nopd nonull <- subset_samples(nopd, Sleep_problems != "") nonull # Let's limit to control samples that we filtered above deseq_feature <- phyloseq_to_deseq2(nonull, ~ Sleep_problems) # now the rest of the deseq2 code: geo_means <- apply(counts(deseq_feature), 1, gm_mean) deseq_feature <- estimateSizeFactors(deseq_feature, geoMeans = geo_means) deseq_feature <- DESeq(deseq_feature, fitType="local") feature_diff_abund <- results(deseq_feature) # Define your cutoff for the adjusted p-value alpha <- 0.05 # Reformat information as data frame including feature as variable significant_feature <- as_tibble(feature_diff_abund, rownames = "feature") # Keep only significant results and sort by adjusted p-value significant_feature <- filter(significant_feature, padj < alpha) significant_feature <- arrange(significant_feature, padj) # Get the taxonomic information as a data frame taxa_df <- as_tibble(as.data.frame(tax_table(physeq)), rownames = "feature") # Combine the significant features with taxonomic classification significant_feature <- inner_join(significant_feature, taxa_df) # there 14 features that are different at FDR-corrected p < 0.05! dim(significant_feature) # Plot differential abundance significant_feature %>% ggplot(aes(x = log2FoldChange, y = Phylum)) + geom_col() # Making plots showing relative abundance # First thing to do is you need to convert to relative abundance # Sticking with family level for this example physeq_RA <- transform_sample_counts(physeq, calculate_RA) phylum_RA <- tax_glom(physeq_RA, taxrank = "Phylum", NArm = FALSE) # Let's graph Porphyromonadaceae and filter for that family with subset_taxa. # porphyromonadaceae <- subset_taxa(family_RA, Family == "f__Porphyromonadaceae") phylum_comparison <- subset_taxa(phylum_RA, Phylum == "p__Firmicutes" | Phylum == "p__Bacteroidetes") # The next step enables us to plot the data by changing the table from # wide to long format. phylum_comparison_melt <- psmelt(phylum_comparison) # Plot with ggplot function # Define what dataset to plot and what variable on x-axis and y-axis ratio <- ggplot(phylum_comparison_melt, aes(x = as.factor(Sleep_problems), y = Abundance, color=Phylum)) + # Visualize data as Boxplot geom_boxplot() + # Label for x-axis xlab("Sleep Problems") + # Label for y-axis ylab("Relative Abundance") + # Plot title ggtitle("") + # colors for the different groups (here subject 1 and 2) # scale_fill_manual(values=c("seagreen3", "indianred1")) + guides(fill = FALSE) + # Set a standard theme for plot theme_bw(base_size = 16) ratio #Run ANOVA. p-value tells you that none of the 4 bins in the boxplot are significantly #different from each other. phylum_comparison_lm <- lm(Abundance~Sleep_problems, data=phylum_comparison_melt) summary(aov(phylum_comparison_lm)) #Is there a significant different between Firmicutes and Bacteroidetes for people without sleep problems. phylum_comparison_no_lm <- lm(Abundance~Phylum, data=filter(phylum_comparison_melt, Sleep_problems == "No")) summary(aov(phylum_comparison_no_lm)) #P-value for comparison between all groups phylum_comparison_yes_lm <- lm(Abundance~Phylum, data=filter(phylum_comparison_melt, Sleep_problems == "Yes")) summary(aov(phylum_comparison_yes_lm)) #P-value for comparison between firmicutes firmicutes_comparison_lm <- lm(Abundance~Sleep_problems, data=filter(phylum_comparison_melt,Phylum=="p__Firmicutes")) summary(aov(firmicutes_comparison_lm)) TukeyHSD(aov(firmicutes_comparison_lm)) #P-value for comparison between bacteriodetes bacteroidetes_comparison_lm <- lm(Abundance~Sleep_problems, data=filter(phylum_comparison_melt,Phylum=="p__Bacteroidetes")) summary(aov(bacteroidetes_comparison_lm)) TukeyHSD(aov(bacteroidetes_comparison_lm)) ### significant of firmicutes/bacteroidetes ratio differing between sleep problems and no sleep problems phylum_comparison_melt = pivot_wider(phylum_comparison_melt,id_cols=-1,names_from="Phylum",values_from="Abundance") phylum_comparison_melt$ratio = phylum_comparison_melt$p__Firmicutes/phylum_comparison_melt$p__Bacteroidetes ggplot(phylum_comparison_melt,aes(x=Sleep_problems,y=log(ratio,base=2))) + geom_boxplot() + # Label for x-axis xlab("Sleep Problems") + # Label for y-axis ylab("Log Ratio of Firmicutes to Bacteroidetes") + # Plot title ggtitle("") + # Set a standard theme for plot theme_bw(base_size = 16) #Deteremine firmicutes/bacteroidetes ratio P-Value ratio_comparison_lm <- lm(log(ratio,base=2)~Sleep_problems, data=phylum_comparison_melt) summary(aov(ratio_comparison_lm))