<script1.sh>
#!/bin/bash

mkdir Parkinsons_Data
cd Parkinsons_Data

#I did not realize there were already metadata on the server so I took the excel file on the course page, pasted into google sheets and saved as a .tsv. Then I copied it to the server in which it was called "Parkinsons_Metadata"
cp -R /mnt/datasets/project_2/parkinsons/seqs /root/Parkinsons_Data
mv parkinsons_manifest.txt parkinsons_manifest.tsv
cp mnt/datasets/project_2/parkinsons/

#Putting metadata into a QIIME2 visual format
qiime metadata tabulate \
  --m-input-file parkinsons_metadata.tsv \
  --o-visualization parkinsons_metadata.qzv

#Creating a demultiplexed sequence summary artifact
qiime tools import \
  --type "SampleData[SequencesWithQuality]" \
  --input-format SingleEndFastqManifestPhred33V2 \
  --input-path ./parkinsons_manifest.tsv \
  --output-path ./demux_seqs.qza

#Creating a demultiplexed sequence summary table
qiime demux summarize \
  --i-data ./demux_seqs.qza \
  --o-visualization ./demux_seqs.qzv

#Denoising our sequences and truncation to 249 because 249 was the smallest read length
qiime dada2 denoise-single \
  --i-demultiplexed-seqs ./demux_seqs.qza \
  --p-trunc-len 249 \
  --o-table ./dada2_table.qza \
  --o-representative-sequences ./dada2_rep_set.qza \
  --o-denoising-stats ./dada2_stats.qza

#Create a visualization for the data after denoising
qiime metadata tabulate \
  --m-input-file ./dada2_stats.qza  \
  --o-visualization ./dada2_stats.qzv

#Creating the Feature Table
qiime feature-table summarize \
  --i-table ./dada2_table.qza \
  --m-sample-metadata-file ./parkinsons_metadata.tsv \
  --o-visualization ./dada2_table.qzv

#Downloaded the greengenes 13_8 reference sequence for phylogenetic relationships also looks at specific region of 16S rRNA gene (V4 region) - 515F/806R
wget \
  -O "sepp-refs-gg-13-8.qza" \
  "https://data.qiime2.org/2020.11/common/sepp-refs-gg-13-8.qza"

#Introducing the representative sequence for creation of phylogenetic relationships
qiime fragment-insertion sepp \
  --i-representative-sequences ./dada2_rep_set.qza \
  --i-reference-database sepp-refs-gg-13-8.qza \
  --o-tree ./tree.qza \
  --o-placements ./tree_placements.qza \
  --p-threads 1

#Creating alpha rarefaction table from sampling depth 100 to 19000
qiime diversity alpha-rarefaction \
  --i-table ./dada2_table.qza \
  --m-metadata-file ./parkinsons_metadata.tsv \
  --o-visualization ./alpha_rarefaction_curves.qzv \
  --p-min-depth 100 \
  --p-max-depth 19000

#Creating many PCoA plots of various beta diversity metrics while also creating artifacts for further analysis
qiime diversity core-metrics-phylogenetic \
  --i-table ./dada2_table.qza \
  --i-phylogeny ./tree.qza \
  --m-metadata-file ./parkinsons_metadata.tsv \
  --p-sampling-depth 6000 \
  --output-dir ./core-metrics-results_6000

#Creating Faith Phylogenetic diversity boxplots
qiime diversity alpha-group-significance \
 --i-alpha-diversity ./core-metrics-results_6000/faith_pd_vector.qza \
 --m-metadata-file ./parkinsons_metadata.tsv \
 --o-visualization ./core-metrics-results_6000/faiths-diversity.qzv

#Creating Evenness Boxplots
qiime diversity alpha-group-significance \
 --i-alpha-diversity ./core-metrics-results_6000/evenness_vector.qza \
 --m-metadata-file ./parkinsons_metadata.tsv \
 --o-visualization ./core-metrics-results_6000/evenness_statistics.qzv

#Creating unweighted unifrac boxplot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000/unweighted_unifrac_distance_matrix.qza \
  --m-metadata-file parkinsons_metadata.tsv \
  --m-metadata-column Disease \
  --o-visualization core-metrics-results_6000/unweighted-unifrac-Disease-significance.qzv

#Creating weighted unifrac boxplot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000/weighted_unifrac_distance_matrix.qza \
  --m-metadata-file parkinsons_metadata.tsv \
  --m-metadata-column Disease \
  --o-visualization core-metrics-results_6000/weighted-unifrac-Disease-significance.qzv

#Bray Curtis Box Plot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000/bray_curtis_distance_matrix.qza \
  --m-metadata-file parkinsons_metadata.tsv \
  --m-metadata-column Disease \
  --o-visualization core-metrics-results_6000/bray-curtis-Disease-significance.qzv

#Jaccard Box Plot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000/jaccard_distance_matrix.qza \
  --m-metadata-file parkinsons_metadata.tsv \
  --m-metadata-column Disease \
  --o-visualization core-metrics-results_6000/jaccard-Disease-significance.qzv

wget \
  -O "gg-13-8-99-515-806-nb-classifier.qza" \
  "https://data.qiime2.org/2021.2/common/gg-13-8-99-515-806-nb-classifier.qza"

#Using pre-trained classifier from green genes to setup taxonomy artifact
qiime feature-classifier classify-sklearn \
  --i-reads ./dada2_rep_set.qza \
  --i-classifier ./gg-13-8-99-515-806-nb-classifier.qza \
  --o-classification ./taxonomy.qza

#Creating Taxonomy classification of ASVs
qiime metadata tabulate \
  --m-input-file ./taxonomy.qza \
  --o-visualization ./taxonomy.qzv

#Creating a table of sequences within the taxonomy
qiime feature-table tabulate-seqs \
  --i-data ./dada2_rep_set.qza \
  --o-visualization ./dada2_rep_set.qzv

#Creating feature table of only 6000 sampling depth
qiime feature-table filter-samples \
  --i-table ./dada2_table.qza \
  --p-min-frequency 6000 \
  --o-filtered-table ./table_6k.qza

#Creating the taxonomy Barplot
qiime taxa barplot \
  --i-table ./table_6k.qza \
  --i-taxonomy ./taxonomy.qza \
  --m-metadata-file ./parkinsons_metadata.tsv \
  --o-visualization ./taxa_barplot_6000.qzv


#Filtering Steps


#Filtering metadata so table only shows Healthy Samples
qiime feature-table filter-samples \
  --i-table dada2_table.qza \
  --m-metadata-file parkinsons_metadata.tsv \
  --p-where "[Disease]='Control' AND NOT [Coffe_drinker]='null'" \
  --o-filtered-table Healthy_Coffee_only_table.qza

#Creating healthy only - feature table visualization
qiime feature-table summarize \
  --i-table ./Healthy_Coffee_only_table.qza \
  --m-sample-metadata-file ./parkinsons_metadata.tsv \
  --o-visualization ./Healthy_Coffee_only_table.qzv


#Filtering metadata so table only shows PD Samples
qiime feature-table filter-samples \
  --i-table dada2_table.qza \
  --m-metadata-file parkinsons_metadata.tsv \
  --p-where "[Disease]='PD' AND NOT [Coffe_drinker]='null'" \
  --o-filtered-table PD_Coffee_only_table.qza

#Creating PD only - feature table visualization
qiime feature-table summarize \
  --i-table ./PD_Coffee_only_table.qza \
  --m-sample-metadata-file ./parkinsons_metadata.tsv \
  --o-visualization ./PD_Coffee_only_table.qzv


#Filtering metadata so table only shows Healthy Samples
qiime feature-table filter-samples \
  --i-table dada2_table.qza \
  --m-metadata-file Parkinsons_Metadata_Abx.tsv \
  --p-where "[Disease]='Control' AND NOT [Abx_doses_last_five_years]='null'" \
  --o-filtered-table Healthy_Abx_only_table.qza

#Creating healthy only - feature table visualization
qiime feature-table summarize \
  --i-table ./Healthy_Abx_only_table.qza \
  --m-sample-metadata-file ./Parkinsons_Metadata_Abx.tsv \
  --o-visualization ./Healthy_Abx_only_table.qzv

#Filtering metadata so table only shows PD Samples
qiime feature-table filter-samples \
  --i-table dada2_table.qza \
  --m-metadata-file Parkinsons_Metadata_Abx.tsv \
  --p-where "[Disease]='PD' AND NOT [Abx_doses_last_five_years]='null'" \
  --o-filtered-table PD_Abx_only_table.qza

#Creating PD only - feature table visualization
qiime feature-table summarize \
  --i-table ./PD_Abx_only_table.qza \
  --m-sample-metadata-file ./Parkinsons_Metadata_Abx.tsv \
  --o-visualization ./PD_Abx_only_table.qzv

#Analyzing Coffee-Drinking in Healthy controls


qiime diversity core-metrics-phylogenetic \
  --i-table ./Healthy_Coffee_only_table.qza \
  --i-phylogeny ./tree.qza \
  --m-metadata-file ./parkinsons_metadata.tsv \
  --p-sampling-depth 6000 \
  --output-dir ./core-metrics-results_6000_HealthyCoffee

#Creating Faith Phylogenetic diversity boxplots
qiime diversity alpha-group-significance \
 --i-alpha-diversity ./core-metrics-results_6000_HealthyCoffee/faith_pd_vector.qza \
 --m-metadata-file ./parkinsons_metadata.tsv \
 --o-visualization ./core-metrics-results_6000_HealthyCoffee/faiths-diversity.qzv

#Creating Evenness Boxplots
qiime diversity alpha-group-significance \
 --i-alpha-diversity ./core-metrics-results_6000_HealthyCoffee/evenness_vector.qza \
 --m-metadata-file ./parkinsons_metadata.tsv \
 --o-visualization ./core-metrics-results_6000_HealthyCoffee/evenness_statistics.qzv

#Creating unweighted unifrac boxplot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_HealthyCoffee/unweighted_unifrac_distance_matrix.qza \
  --m-metadata-file parkinsons_metadata.tsv \
  --m-metadata-column Coffe_drinker \
  --o-visualization core-metrics-results_6000_HealthyCoffee/unweighted-unifrac-Coffee-significance.qzv

#Creating weighted unifrac boxplot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_HealthyCoffee/weighted_unifrac_distance_matrix.qza \
  --m-metadata-file parkinsons_metadata.tsv \
  --m-metadata-column Coffe_drinker \
  --o-visualization core-metrics-results_6000_HealthyCoffee/weighted-unifrac-Coffee-significance.qzv

#Bray Curtis Box Plot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_HealthyCoffee/bray_curtis_distance_matrix.qza \
  --m-metadata-file parkinsons_metadata.tsv \
  --m-metadata-column Coffe_drinker \
  --o-visualization core-metrics-results_6000_HealthyCoffee/bray-curtis-Coffee-significance.qzv

#Jaccard Box Plot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_HealthyCoffee/jaccard_distance_matrix.qza \
  --m-metadata-file parkinsons_metadata.tsv \
  --m-metadata-column Coffe_drinker \
  --o-visualization core-metrics-results_6000_HealthyCoffee/jaccard-Coffee-significance.qzv


#Creating feature table of only 6000 sampling depth
qiime feature-table filter-samples \
  --i-table ./Healthy_Coffee_only_table.qza \
  --p-min-frequency 6000 \
  --o-filtered-table ./Healthy_Coffee_only_table_6k.qza

#Creating the taxonomy Barplot
qiime taxa barplot \
  --i-table ./Healthy_Coffee_only_table_6k.qza \
  --i-taxonomy ./taxonomy.qza \
  --m-metadata-file ./parkinsons_metadata.tsv \
  --o-visualization ./taxa_barplot_Healthy_Coffee_only_6000.qzv


#Analyzing Coffee-Drinking in PD Patients


qiime diversity core-metrics-phylogenetic \
  --i-table ./PD_Coffee_only_table.qza \
  --i-phylogeny ./tree.qza \
  --m-metadata-file ./parkinsons_metadata.tsv \
  --p-sampling-depth 6000 \
  --output-dir ./core-metrics-results_6000_PD_Coffee

#Creating Faith Phylogenetic diversity boxplots
qiime diversity alpha-group-significance \
 --i-alpha-diversity ./core-metrics-results_6000_PD_Coffee/faith_pd_vector.qza \
 --m-metadata-file ./parkinsons_metadata.tsv \
 --o-visualization ./core-metrics-results_6000_PD_Coffee/faiths-diversity.qzv

#Creating Evenness Boxplots
qiime diversity alpha-group-significance \
 --i-alpha-diversity ./core-metrics-results_6000_PD_Coffee/evenness_vector.qza \
 --m-metadata-file ./parkinsons_metadata.tsv \
 --o-visualization ./core-metrics-results_6000_PD_Coffee/evenness_statistics.qzv

#Creating unweighted unifrac boxplot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_PD_Coffee/unweighted_unifrac_distance_matrix.qza \
  --m-metadata-file parkinsons_metadata.tsv \
  --m-metadata-column Coffe_drinker \
  --o-visualization core-metrics-results_6000_PD_Coffee/unweighted-unifrac-Coffee-significance.qzv

#Creating weighted unifrac boxplot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_PD_Coffee/weighted_unifrac_distance_matrix.qza \
  --m-metadata-file parkinsons_metadata.tsv \
  --m-metadata-column Coffe_drinker \
  --o-visualization core-metrics-results_6000_PD_Coffee/weighted-unifrac-Coffee-significance.qzv

#Bray Curtis Box Plot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_PD_Coffee/bray_curtis_distance_matrix.qza \
  --m-metadata-file parkinsons_metadata.tsv \
  --m-metadata-column Coffe_drinker \
  --o-visualization core-metrics-results_6000_PD_Coffee/bray-curtis-Coffee-significance.qzv

#Jaccard Box Plot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_PD_Coffee/jaccard_distance_matrix.qza \
  --m-metadata-file parkinsons_metadata.tsv \
  --m-metadata-column Coffe_drinker \
  --o-visualization core-metrics-results_6000_PD_Coffee/jaccard-Coffee-significance.qzv


#Creating feature table of only 6000 sampling depth
qiime feature-table filter-samples \
  --i-table ./PD_Coffee_only_table.qza \
  --p-min-frequency 6000 \
  --o-filtered-table ./PD_Coffee_only_table_6k.qza

#Creating the taxonomy Barplot
qiime taxa barplot \
  --i-table ./PD_Coffee_only_table_6k.qza \
  --i-taxonomy ./taxonomy.qza \
  --m-metadata-file ./parkinsons_metadata.tsv \
  --o-visualization ./taxa_barplot_PD_Coffee_only_6000.qzv


#Analyzing Abx in Healthy controls


qiime diversity core-metrics-phylogenetic \
  --i-table ./Healthy_Abx_only_table.qza \
  --i-phylogeny ./tree.qza \
  --m-metadata-file ./Parkinsons_Metadata_Abx.tsv \
  --p-sampling-depth 6000 \
  --output-dir ./core-metrics-results_6000_HealthyAbx

#Creating Faith Phylogenetic diversity boxplots
qiime diversity alpha-group-significance \
 --i-alpha-diversity ./core-metrics-results_6000_HealthyAbx/faith_pd_vector.qza \
 --m-metadata-file ./Parkinsons_Metadata_Abx.tsv \
 --o-visualization ./core-metrics-results_6000_HealthyAbx/faiths-diversity.qzv

#Creating Evenness Boxplots
qiime diversity alpha-group-significance \
 --i-alpha-diversity ./core-metrics-results_6000_HealthyAbx/evenness_vector.qza \
 --m-metadata-file ./Parkinsons_Metadata_Abx.tsv \
 --o-visualization ./core-metrics-results_6000_HealthyAbx/evenness_statistics.qzv

#Creating unweighted unifrac boxplot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_HealthyAbx/unweighted_unifrac_distance_matrix.qza \
  --m-metadata-file Parkinsons_Metadata_Abx.tsv \
  --m-metadata-column Abx_doses_last_five_years \
  --o-visualization core-metrics-results_6000_HealthyAbx/unweighted-unifrac-Abx-significance.qzv

#Creating weighted unifrac boxplot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_HealthyAbx/weighted_unifrac_distance_matrix.qza \
  --m-metadata-file Parkinsons_Metadata_Abx.tsv \
  --m-metadata-column Abx_doses_last_five_years \
  --o-visualization core-metrics-results_6000_HealthyAbx/weighted-unifrac-Abx-significance.qzv

#Bray Curtis Box Plot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_HealthyAbx/bray_curtis_distance_matrix.qza \
  --m-metadata-file Parkinsons_Metadata_Abx.tsv \
  --m-metadata-column Abx_doses_last_five_years \
  --o-visualization core-metrics-results_6000_HealthyAbx/bray-curtis-Abx-significance.qzv

#Jaccard Box Plot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_HealthyAbx/jaccard_distance_matrix.qza \
  --m-metadata-file Parkinsons_Metadata_Abx.tsv \
  --m-metadata-column Abx_doses_last_five_years \
  --o-visualization core-metrics-results_6000_HealthyAbx/jaccard-Abx-significance.qzv


#Creating feature table of only 6000 sampling depth
qiime feature-table filter-samples \
  --i-table ./Healthy_Abx_only_table.qza \
  --p-min-frequency 6000 \
  --o-filtered-table ./Healthy_Abx_only_table_6k.qza

#Creating the taxonomy Barplot
qiime taxa barplot \
  --i-table ./Healthy_Abx_only_table_6k.qza \
  --i-taxonomy ./taxonomy.qza \
  --m-metadata-file ./Parkinsons_Metadata_Abx.tsv \
  --o-visualization ./taxa_barplot_Healthy_Abx_only_6000.qzv


#Analyzing Abx in PD Patients


qiime diversity core-metrics-phylogenetic \
  --i-table ./PD_Abx_only_table.qza \
  --i-phylogeny ./tree.qza \
  --m-metadata-file ./Parkinsons_Metadata_Abx.tsv \
  --p-sampling-depth 6000 \
  --output-dir ./core-metrics-results_6000_PD_Abx

#Creating Faith Phylogenetic diversity boxplots
qiime diversity alpha-group-significance \
 --i-alpha-diversity ./core-metrics-results_6000_PD_Abx/faith_pd_vector.qza \
 --m-metadata-file ./Parkinsons_Metadata_Abx.tsv \
 --o-visualization ./core-metrics-results_6000_PD_Abx/faiths-diversity.qzv

#Creating Evenness Boxplots
qiime diversity alpha-group-significance \
 --i-alpha-diversity ./core-metrics-results_6000_PD_Abx/evenness_vector.qza \
 --m-metadata-file ./Parkinsons_Metadata_Abx.tsv \
 --o-visualization ./core-metrics-results_6000_PD_Abx/evenness_statistics.qzv

#Creating unweighted unifrac boxplot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_PD_Abx/unweighted_unifrac_distance_matrix.qza \
  --m-metadata-file Parkinsons_Metadata_Abx.tsv \
  --m-metadata-column Abx_doses_last_five_years \
  --o-visualization core-metrics-results_6000_PD_Abx/unweighted-unifrac-Abx-significance.qzv

#Creating weighted unifrac boxplot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_PD_Abx/weighted_unifrac_distance_matrix.qza \
  --m-metadata-file Parkinsons_Metadata_Abx.tsv \
  --m-metadata-column Abx_doses_last_five_years \
  --o-visualization core-metrics-results_6000_PD_Abx/weighted-unifrac-Abx-significance.qzv

#Bray Curtis Box Plot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_PD_Abx/bray_curtis_distance_matrix.qza \
  --m-metadata-file Parkinsons_Metadata_Abx.tsv \
  --m-metadata-column Abx_doses_last_five_years \
  --o-visualization core-metrics-results_6000_PD_Abx/bray-curtis-Abx-significance.qzv

#Jaccard Box Plot
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results_6000_PD_Abx/jaccard_distance_matrix.qza \
  --m-metadata-file Parkinsons_Metadata_Abx.tsv \
  --m-metadata-column Abx_doses_last_five_years \
  --o-visualization core-metrics-results_6000_PD_Abx/jaccard-Abx-significance.qzv


#Creating feature table of only 6000 sampling depth
qiime feature-table filter-samples \
  --i-table ./PD_Abx_only_table.qza \
  --p-min-frequency 6000 \
  --o-filtered-table ./PD_Abx_only_table_6k.qza

#Creating the taxonomy Barplot
qiime taxa barplot \
  --i-table ./PD_Abx_only_table_6k.qza \
  --i-taxonomy ./taxonomy.qza \
  --m-metadata-file ./Parkinsons_Metadata_Abx.tsv \
  --o-visualization ./taxa_barplot_PD_Abx_only_6000.qzv


#Exporting files to compare PD vs Healthy Question while also looking at coffee consumption in R

qiime tools export \
  --input-path dada2_table.qza
  --output-path exported 

qiime tools export \
  --input-path taxonomy.qza
  --output-path exported

qiime tools export \
  --input-path tree.qza
  --output-path exported 

cd exported

#Editing column names in newly made taxonomy TSV file
nano taxonomy.tsv

cd ..

#Creating biom file with taxonomy information to be used in R
biom add-metadata \
  --i exported/feature-table.biom \
  --o exported/table-with-taxonomy.biom \
  --observation-metadata-fp exported/taxonomy.tsv \
  --sc-separated taxonomy


#Exporting files to specifically do analysis on antibiotic consumption of PD patients

qiime tools export \
  --input-path PD_Abx_only_table.qza
  --output-path exported_Abx

qiime tools export \
  --input-path taxonomy.qza
  --output-path exported_Abx

qiime tools export \
  --input-path tree.qza
  --output-path exported_Abx

cd exported_Abx

#Change file name to not get it confused with BIOM file above
mv feature-table.biom PD_Abx.biom

#Editing column names in newly made taxonomy TSV file
nano taxonomy.tsv

cd ..

#Creating biom file with taxonomy information to be used in R
biom add-metadata \
  --i exported/PD_Abx.biom \
  --o exported/PD_Abx_only.biom \
  --observation-metadata-fp exported/taxonomy.tsv \
  --sc-separated taxonomy

#Exporting files to specifically do analysis on antibiotic consumption of Healthy participants

qiime tools export \
  --input-path Healthy_Abx_only_table.qza
  --output-path exported_Healthy_Abx

qiime tools export \
  --input-path taxonomy.qza
  --output-path exported_Healthy_Abx

qiime tools export \
  --input-path tree.qza
  --output-path exported_Healthy_Abx

cd exported_Healthy_Abx

#Change file name to not get it confused with BIOM file above
mv feature-table.biom Healthy_Abx.biom

#Editing column names in newly made taxonomy TSV file
nano taxonomy.tsv

cd ..

#Creating biom file with taxonomy information to be used in R
biom add-metadata \
  --i exported/Healthy_Abx.biom \
  --o exported/Healthy_Abx_only.biom \
  --observation-metadata-fp exported/taxonomy.tsv \
  --sc-separated taxonomy


#Then copied the BIOM, NWK, and corresponding metadata file onto local computer to do the R analysis


<script2.r>
# Load CRAN packages
library(tidyverse)
library(vegan)
library(ape)

# Load Bioconductor packages
library(phyloseq)
library(DESeq2)

# Calculate relative abundance
calculate_relative_abundance <- function(x) x / sum(x)

# Calculate geometric mean 
calculate_gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# Define the set of random numbers
set.seed(711)

# Import qiime files
biom_file <- import_biom("table-with-taxonomy.biom")
metadata <- import_qiime_sample_data("Parkinsons_Metadata.tsv")
tree <- read_tree_greengenes("tree.nwk")

# IMPORTANT!!! Convert tree from multichotomous to dichotomous tree
tree <- multi2di(tree)

# Combine information into single phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)

# Change taxonomic levels from numbers to actual names
colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class","Order", "Family",
                                 "Genus", "Species")
head(tax_table(physeq))

#Setting sampling depth
sample_sums(physeq) >= 6000

#Filter samples based on sampling depth
at_least_6000 <- prune_samples(sample_sums(physeq) >= 6000, physeq)
sample_sums(at_least_6000)

#Getting most abundant taxa at the genus level
counts <- taxa_sums(at_least_6000)
relative_abundance <- calculate_relative_abundance(counts)
abundant <- relative_abundance > 0.001
abundant_taxa <- prune_taxa(abundant, at_least_6000)

abundant_genera <- tax_glom(abundant_taxa, taxrank = "Genus")
abundant_genera

abundant_genera_family <- tax_glom(abundant_taxa, taxrank = "Family")
abundant_genera_family

abundant_genera_species <- tax_glom(abundant_taxa, taxrank = "Species")
abundant_genera_species

#DESeq2 Analysis Genus
deseq <- phyloseq_to_deseq2(abundant_genera, ~ Disease)
geo_means <- apply(counts(deseq), 1, calculate_gm_mean)
deseq <- estimateSizeFactors(deseq, geoMeans = geo_means)
deseq <- DESeq(deseq, fitType = "local")

diff_abund <- results(deseq)

#DESeq2 Analysis Family
deseq_family <- phyloseq_to_deseq2(abundant_genera_family, ~ Disease)
geo_means_family <- apply(counts(deseq_family), 1, calculate_gm_mean)
deseq_family <- estimateSizeFactors(deseq_family, geoMeans = geo_means_family)
deseq_family <- DESeq(deseq_family, fitType = "local")

diff_abund_family <- results(deseq_family)

#DESeq2 Analysis species
deseq_species <- phyloseq_to_deseq2(abundant_genera_species, ~ Disease)
geo_means_species <- apply(counts(deseq_species), 1, calculate_gm_mean)
deseq_species <- estimateSizeFactors(deseq_species, geoMeans = geo_means_species)
deseq_species <- DESeq(deseq_species, fitType = "local")

diff_abund_species <- results(deseq_species)

#Filter data based on alpha < 0.05 for genus
alpha <- 0.05
significant <- as.data.frame(diff_abund)
significant <- filter(significant, padj < alpha)

#Filter data based on alpha < 0.05 for family
alpha <- 0.05
significant_family <- as.data.frame(diff_abund_family)
significant_family <- filter(significant_family, padj < alpha)

#Filter data based on alpha < 0.05 for species
alpha <- 0.05
significant_species <- as.data.frame(diff_abund_species)
significant_species <- filter(significant_species, padj < alpha)

#Merging Data for genus
genera_df <- as.data.frame(tax_table(abundant_genera))
significant <- merge(significant, genera_df, by = "row.names")
significant <- arrange(significant, log2FoldChange)

dim(significant)

#Merging Data for family
family_df <- as.data.frame(tax_table(abundant_genera_family))
significant_family <- merge(significant_family, family_df, by = "row.names")
significant_family <- arrange(significant_family, log2FoldChange)

dim(significant_family)

#Merging Data for species
species_df <- as.data.frame(tax_table(abundant_genera_species))
significant_species <- merge(significant_species, species_df, by = "row.names")
significant_species <- arrange(significant_species, log2FoldChange)

dim(significant_species)

#Creating differential abundance genera plot
significant <- filter(significant, Genus != "g__")
significant <- mutate(significant,
                          Genus = factor(Genus, levels = Genus))

ggplot(significant, aes(x = log2FoldChange, y = Genus)) +
  geom_bar(stat = "identity") +
  labs(title = "Differential abundant genera - PD vs Healthy",
       x = "log2 Fold-Change",
       y = "Genus") +
  theme_bw()


#Creating differential abundance family plot
significant_family <- mutate(significant_family,
                      Family = factor(Family, levels = unique(Family)))

ggplot(significant_family, aes(x = log2FoldChange, y = Family)) +
  geom_bar(stat = "identity") +
  labs(title = "Differential abundant Family",
       x = "log2 Fold-Change",
       y = "Family") +
  theme_bw()

#Relative abundance
at_least_6000_RA <- transform_sample_counts(at_least_6000, calculate_relative_abundance)

#Getting most abundant taxa and getting relative abundance graphs
counts <- taxa_sums(at_least_6000)
relative_abundance <- calculate_relative_abundance(counts)
abundant <- relative_abundance > 0.001
abundant_RA_taxa <- prune_taxa(abundant, at_least_6000_RA)

abundant_RA_taxa <- tax_glom(abundant_RA_taxa, taxrank = "Genus")
bifidobacterium <- subset_taxa(abundant_RA_taxa, Genus == "g__Bifidobacterium")
otu_table(bifidobacterium)

bifidobacterium_long <- psmelt(bifidobacterium)
bifidobacterium_long

roseburia <- subset_taxa(abundant_RA_taxa, Genus == "g__Roseburia")
otu_table(roseburia)

roseburia_long <- psmelt(roseburia)
roseburia_long

faecalibacterium <- subset_taxa(abundant_RA_taxa, Genus == "g__Faecalibacterium")
otu_table(faecalibacterium)

faecalibacterium_long <- psmelt(faecalibacterium)
faecalibacterium_long

collinsella <- subset_taxa(abundant_RA_taxa, Genus == "g__Collinsella")
otu_table(collinsella)

collinsella_long <- psmelt(collinsella)
collinsella_long

bilophila <- subset_taxa(abundant_RA_taxa, Genus == "g__Bilophila")
otu_table(bilophila)

bilophila_long <- psmelt(bilophila)
bilophila_long

akkermansia <- subset_taxa(abundant_RA_taxa, Genus == "g__Akkermansia")
otu_table(akkermansia)

akkermansia_long <- psmelt(akkermansia)
akkermansia_long


#Relative abundance plot
ggplot(bifidobacterium_long, aes (x = Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance Bifidobacterium",
                                                                                       x = "Disease",
                                                                                       y = "Relative Abundance")
                                                                                       
ggplot(roseburia_long, aes (x = Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance Roseburia",
                                                                                       x = "Disease",
                                                                                       y = "Relative Abundance")

ggplot(faecalibacterium_long, aes (x = Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance Faecalibacterium",
                                                                                   x = "Disease",
                                                                                   y = "Relative Abundance")

ggplot(collinsella_long, aes (x = Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance Collinsella",
                                                                                 x = "Disease",
                                                                                 y = "Relative Abundance")

ggplot(bilophila_long, aes (x = Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance Bilophila",
                                                                                   x = "Disease",
                                                                                   y = "Relative Abundance")

ggplot(akkermansia_long, aes (x = Disease, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance Akkermansia",
                                                                                   x = "Disease",
                                                                                   y = "Relative Abundance")

#Creating differential abundance plot
significant <- mutate(significant,
                      Genus = factor(Genus, levels = unique(Genus)))

ggplot(significant, aes(x = log2FoldChange, y = Genus)) +
  geom_bar(stat = "identity") +
  labs(title = "Differential abundant genera",
       x = "log2 Fold-Change",
       y = "Genus") +
  theme_bw()
             

<script3.r>
# Load CRAN packages
library(tidyverse)
library(vegan)
library(ape)

# Load Bioconductor packages
library(phyloseq)
library(DESeq2)

# Calculate relative abundance
calculate_relative_abundance <- function(x) x / sum(x)

# Calculate geometric mean 
calculate_gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# Define the set of random numbers
set.seed(711)

# Import qiime files
biom_file_PD <- import_biom("PD_Abx_only.biom")
biom_file_Healthy <- import_biom("Healthy_Abx_only.biom")
metadata <- import_qiime_sample_data("Parkinsons_Metadata_Abx.tsv")
tree <- read_tree_greengenes("tree.nwk")

# IMPORTANT!!! Convert tree from multichotomous to dichotomous tree
tree <- multi2di(tree)

# Combine information into single phyloseq object
healthy_physeq <- merge_phyloseq(biom_file_Healthy, metadata, tree)
PD_physeq <- merge_phyloseq(biom_file_PD, metadata, tree)

# Change taxonomic levels from numbers to actual names
colnames(tax_table(healthy_physeq)) <- c("Kingdom", "Phylum", "Class","Order", "Family",
                                 "Genus", "Species")

colnames(tax_table(PD_physeq)) <- c("Kingdom", "Phylum", "Class","Order", "Family",
                                         "Genus", "Species")


#Setting sampling depth
sample_sums(physeq) >= 6000

#Filtering Metadata so all samples involved have at least 6000 features
healthy_6000 <- prune_samples(sample_sums(healthy_physeq) >= 6000, healthy_physeq)
sample_sums(healthy_6000)

PD_6000 <- prune_samples(sample_sums(PD_physeq) >= 6000, PD_physeq)
sample_sums(PD_6000)

#Getting most abundant taxa for Healthy patients at the genus level
healthy_counts <- taxa_sums(healthy_6000)
healthy_relative_abundance <- calculate_relative_abundance(healthy_counts)
healthy_abundant <- healthy_relative_abundance > 0.001
healthy_abundant_taxa <- prune_taxa(healthy_abundant, healthy_6000)

healthy_abundant_genera <- tax_glom(healthy_abundant_taxa, taxrank = "Genus")
healthy_abundant_genera

#Getting most abundant taxa for PD patients at the genus level
PD_counts <- taxa_sums(PD_6000)
PD_relative_abundance <- calculate_relative_abundance(PD_counts)
PD_abundant <- PD_relative_abundance > 0.001
PD_abundant_taxa <- prune_taxa(PD_abundant, PD_6000)

PD_abundant_genera <- tax_glom(PD_abundant_taxa, taxrank = "Genus")
PD_abundant_genera

#DESeq2 Analysis Healthy Patients
healthy_deseq <- phyloseq_to_deseq2(healthy_abundant_genera, ~ Abx_doses_last_five_years)
geo_means <- apply(counts(healthy_deseq), 1, calculate_gm_mean)
healthy_deseq <- estimateSizeFactors(healthy_deseq, geoMeans = geo_means)
healthy_deseq <- DESeq(healthy_deseq, fitType = "local")

healthy_diff_abund <- results(healthy_deseq)

#DESeq2 Analysis PD Patients
PD_deseq <- phyloseq_to_deseq2(PD_abundant_genera, ~ Abx_doses_last_five_years)
geo_means <- apply(counts(PD_deseq), 1, calculate_gm_mean)
PD_deseq <- estimateSizeFactors(PD_deseq, geoMeans = geo_means)
PD_deseq <- DESeq(PD_deseq, fitType = "local")

PD_diff_abund <- results(PD_deseq)

#Filter data based on alpha < 0.05 for healthy patients
alpha <- 0.05
healthy_significant <- as.data.frame(healthy_diff_abund)
healthy_significant <- filter(healthy_significant, padj < alpha)

#Filter data based on alpha < 0.05 for PD patients
alpha <- 0.05
PD_significant <- as.data.frame(PD_diff_abund)
PD_significant <- filter(PD_significant, padj < alpha)

#Merging Data PD Patients
PD_genera_df <- as.data.frame(tax_table(PD_abundant_genera))
PD_significant <- merge(PD_significant, PD_genera_df, by = "row.names")
PD_significant <- arrange(PD_significant, log2FoldChange)

dim(PD_significant)

#Creating differential abundance plot
PD_significant <- mutate(PD_significant,
                      Row.names = factor(Row.names, levels = Row.names))

ggplot(PD_significant, aes(x = log2FoldChange, y = Genus)) +
  geom_bar(stat = "identity") +
  labs(title = "Differential abundant genera",
       x = "log2 Fold-Change",
       y = "Genus") +
  theme_bw()

#Relative abundance
PD_6000_RA <- transform_sample_counts(PD_6000, calculate_relative_abundance)

#Getting most abundant taxa for PD patients at the genus level
PD_counts <- taxa_sums(PD_6000)
PD_relative_abundance <- calculate_relative_abundance(PD_counts)
PD_abundant <- PD_relative_abundance > 0.001
PD_abundant_RA_taxa <- prune_taxa(PD_abundant, PD_6000_RA)

PD_abundant_RA_taxa <- tax_glom(PD_abundant_RA_taxa, taxrank = "Genus")
bifidobacterium <- subset_taxa(PD_abundant_RA_taxa, Genus == "g__Bifidobacterium")
otu_table(bifidobacterium)

bifidobacterium_long <- psmelt(bifidobacterium)
bifidobacterium_long

#Relative abundance plot
ggplot(bifidobacterium_long, aes (x = Abx_doses_last_five_years, y = Abundance)) + geom_boxplot() + labs(title = "Relative abundance Bifidobacterium",
                                                                                                         x = "Abx",
                                                                                                         y = "Abundance")


<script4.r>
# Load CRAN packages
library(tidyverse)
library(vegan)
library(ape)

# Load Bioconductor packages
library(phyloseq)
library(DESeq2)

# Calculate relative abundance
calculate_relative_abundance <- function(x) x / sum(x)

# Calculate geometric mean 
calculate_gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# Define the set of random numbers
set.seed(711)

# Import qiime files
biom_file <- import_biom("table-with-taxonomy.biom")
metadata <- import_qiime_sample_data("Parkinsons_Metadata.tsv")
tree <- read_tree_greengenes("tree.nwk")

# IMPORTANT!!! Convert tree from multichotomous to dichotomous tree
tree <- multi2di(tree)

# Combine information into single phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)

# Change taxonomic levels from numbers to actual names
colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class","Order", "Family",
                                 "Genus", "Species")
head(tax_table(physeq))

#Setting sampling depth
sample_sums(physeq) >= 6000

#Filter samples based on sampling depth
at_least_6000 <- prune_samples(sample_sums(physeq) >= 6000, physeq)
sample_sums(at_least_6000)

#Filter for Healthy Patients
healthy <- subset_samples(at_least_6000, Disease == "Control")
healthy <- subset_samples(healthy, Coffe_drinker != "")

#Filter for PD Patients
PD <- subset_samples(at_least_6000, Disease == "PD")
PD <- subset_samples(PD, Coffe_drinker != "")

#Getting most abundant taxa for Healthy patients at the genus level
healthy_counts <- taxa_sums(healthy)
healthy_relative_abundance <- calculate_relative_abundance(healthy_counts)
healthy_abundant <- healthy_relative_abundance > 0.001
healthy_abundant_taxa <- prune_taxa(healthy_abundant, healthy)

healthy_abundant_genera <- tax_glom(healthy_abundant_taxa, taxrank = "Genus")
healthy_abundant_genera

#Getting most abundant taxa for Healthy patients at the genus level
PD_counts <- taxa_sums(PD)
PD_relative_abundance <- calculate_relative_abundance(PD_counts)
PD_abundant <- PD_relative_abundance > 0.001
PD_abundant_taxa <- prune_taxa(PD_abundant, PD)

PD_abundant_genera <- tax_glom(PD_abundant_taxa, taxrank = "Genus")
PD_abundant_genera

#DESeq2 Analysis Healthy Patients
deseq_healthy <- phyloseq_to_deseq2(healthy_abundant_genera, ~ Coffe_drinker)
geo_means <- apply(counts(deseq_healthy), 1, calculate_gm_mean)
deseq_healthy <- estimateSizeFactors(deseq_healthy, geoMeans = geo_means)
deseq_healthy <- DESeq(deseq_healthy, fitType = "local")

healthy_diff_abund <- results(deseq_healthy)

#DESeq2 Analysis PD Patients
deseq_PD <- phyloseq_to_deseq2(PD_abundant_genera, ~ Coffe_drinker)
geo_means <- apply(counts(deseq_PD), 1, calculate_gm_mean)
deseq_PD <- estimateSizeFactors(deseq_PD, geoMeans = geo_means)
deseq_PD <- DESeq(deseq_PD, fitType = "local")

PD_diff_abund <- results(deseq_PD)

#Filter data based on alpha < 0.05 for Healthy
alpha <- 0.05
healthy_significant <- as.data.frame(healthy_diff_abund)
healthy_significant <- filter(healthy_significant, padj < alpha)

#Filter data based on alpha < 0.05 for PD
alpha <- 0.05
PD_significant <- as.data.frame(PD_diff_abund)
PD_significant <- filter(PD_significant, padj < alpha)