# Increased body mass is associated with decreased gut microbiome diversity in Parkinson's Disease patients

# Novia Chan, Clarisse Echavez, Angela Mathews, Josie Setiawan
# Department of Microbiology and Immunology, University of British Columbia, Vancouver, British Columbia, Canada

## Supplemental R Script

# Open a new R session
# Set working directory as to where the .qza files and metadata .tsv files are contained

## Alpha diversity metrics

# Load CRAN packages
library(tidyverse)
#Loading special packages
library(qiime2R)

# Import metadata
sorted_parkinsons_metadata <- read_tsv('parkinsons_metadata_sorted_grouped.tsv')

## Faith's phylogenetic diversity

# Import faith .qza file
faith_pd <- read_qza("faith_pd_vector.qza")
faith <- faith_pd$data %>% rownames_to_column("SampleID")
faith_pd_metadata <- faith %>%
  left_join(sorted_parkinsons_metadata, by = c("SampleID" = "#SampleID"))

# Plot Faith's phylogenetic diversity
ggplot(faith_pd_metadata,aes(x = BMI_grouping, y = faith_pd)) + 
  stat_boxplot(geom = "errorbar", width = 0.2) +
  geom_boxplot() + 
  scale_y_continuous(breaks = seq(10, 26, by = 2)) +
  scale_x_discrete(limits=c("healthy","overweight","obese")) +
  theme_classic() +  
  theme(strip.background =element_rect(fill="white", linetype = "blank"),
        axis.text.x = element_text(size = 12), axis.title.x = element_text(face="bold", size = 16),
        axis.text.y = element_text(size = 12), axis.title.y = element_text(face="bold", size = 16)) +
  labs(x = "BMI grouping", y = "Faith's phylogenetic diversity",)

## Pielou's evenness

# Import evenness .qza file
evenness_pd <- read_qza("evenness_vector.qza")
evenness <- evenness_pd$data %>% rownames_to_column("SampleID")
evenness_pd_metadata <- evenness %>%
  left_join(sorted_parkinsons_metadata, by = c("SampleID" = "#SampleID"))

# Plot Pielou's evenness
ggplot(evenness_pd_metadata,aes(x = BMI_grouping, y = pielou_evenness)) + 
  stat_boxplot(geom = "errorbar", width = 0.2) +
  geom_boxplot() + 
  scale_x_discrete(limits=c("healthy", "overweight", "obese")) +
  theme_classic() +  
  theme(strip.background =element_rect(fill="white", linetype = "blank"),
        axis.text.x = element_text(size = 12), axis.title.x = element_text(face="bold", size = 16),
        axis.text.y = element_text(size = 12), axis.title.y = element_text(face="bold", size = 16)) +
  labs(x = "BMI grouping", y = "Pielou's Evenness",)


# Open a new R session
# Set working directory as to where the .qza files and metadata .tsv files are contained

## Differential Abundance

# Load CRAN packages
library(vegan)
library(ape)
library(tidyverse)

# Load Bioconductor packages
library(phyloseq)
library(DESeq2)

#Loading special packages
library(qiime2R)

# Import the sorted Parkinson's metadata into R
p_metadata <- read_tsv("parkinsons_metadata_sorted_grouped.tsv")

# Subset metadata to only include healthy and obese BMI groupings
PD_metadata <- p_metadata[(p_metadata$BMI_grouping=="healthy") | (p_metadata$BMI_grouping=="obese"),] %>% subset(Disease=="PD")

# Remove the "#SampleID" header for the first column of the metadata to prepare for creation of a phyloseq object
PD_metadata <- PD_metadata %>%
  column_to_rownames(var = "#SampleID")

# Preparing.qza files and metadata and creating a phyloseq object
sam_data = sample_data(PD_metadata)
features_phyloseq <- qza_to_phyloseq(features = "parkinsons-table.qza")
tree_phyloseq <- qza_to_phyloseq(tree = "rooted-tree.qza")
taxonomy_phyloseq <- qza_to_phyloseq(taxonomy = "parkinsons-taxonomy.qza")
phyloseq_object_all = phyloseq(sample_data(sam_data), features_phyloseq, tree_phyloseq, taxonomy_phyloseq)

# Access the otu_table(), sample_data(), and tax_table() phyloseq object components
taxa_table <- phyloseq_object_all@otu_table
taxonomy <- phyloseq_object_all@tax_table
metadata <- phyloseq_object_all@sam_data

# Convert the phylogenetic tree from multichotomous to dichotomous format
phy_tree <- multi2di(phy_tree(tree_phyloseq))

# Review the samples for the number of reads
sample_sums(phyloseq_object_all)

# Rarefy dataset to 10232 reads per sample based on QIIME2 analysis
physeq_rar <- rarefy_even_depth(phyloseq_object_all, sample.size = 10232)

# Load the relative abundance function
calculate_relative_abundance <- function(x) x / sum(x)

# Calculate relative abundance
physeq_RA <- transform_sample_counts(phyloseq_object_all, calculate_relative_abundance)

# Remove low-abundant features
total_counts <- taxa_sums(phyloseq_object_all)
relative_abundance <- calculate_relative_abundance(total_counts)

# Filter out sequences that have a higher relative abundance than a threshold ratio of 0.0005
abundant <- relative_abundance > 0.0005 
(abundant_taxa <- prune_taxa(abundant, phyloseq_object_all))

# Set taxonomic levels for analysis
(family <- tax_glom(phyloseq_object_all, taxrank = "Family", NArm = FALSE))

# DESeq Object Creation with the categorical variable of BMI grouping
deseq_BMI_grouping <- phyloseq_to_deseq2(family, ~ BMI_grouping)
# Set the healthy BMI grouping as the reference
deseq_BMI_grouping$BMI_grouping <- relevel(deseq_BMI_grouping$BMI_grouping, ref = "healthy")

# Load the geometric mean function
calculate_gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# Calculate geometric means
geo_means <- apply(counts(deseq_BMI_grouping), 1, calculate_gm_mean)
deseq_BMI_grouping <- estimateSizeFactors(deseq_BMI_grouping, geoMeans = geo_means)

deseq_BMI_grouping <- DESeq(deseq_BMI_grouping, fitType = "local")

# View the performed comparisons from DESeq2
resultsNames(deseq_BMI_grouping)

# Extracting differentially abundant microbes with the obese vs healthy BMI grouping comparison
BMI_grouping_diff_abund <- results(deseq_BMI_grouping, name = "BMI_grouping_obese_vs_healthy")

# Define the alpha level to determine significant changes
alpha <- 0.05

# Convert results from DESeq to a data frame for further processing and filter any taxa in which the FDR-corrected p-value is below the alpha level
(significant_BMI_grouping <- BMI_grouping_diff_abund %>% 
    as.data.frame() %>% 
    filter(padj < alpha))

# Merge table with significant results with the table of taxonomic information according to "row.names" and sort data frame by log2FoldChange from lowest to highest
family_df <- as.data.frame(tax_table(family))
(significant_BMI_grouping <- significant_BMI_grouping %>% 
    merge(family_df, by = "row.names") %>% 
    arrange(log2FoldChange))

# Visualize the differential abundance family data
ggplot(significant_BMI_grouping, aes(x = log2FoldChange, y = Family)) +
  geom_bar(stat = "identity") +
  labs(title = "Differential abundant family",
       x = expression(log[2]~fold~change),
       y = "Family") +
  theme_bw(base_size = 30)


# Open a new R session
# Set working directory as to where the .qza files and metadata .tsv files are contained

## Indicator taxa analysis

# Load necessary packages
library(tidyverse)
library(phyloseq)
library(indicspecies)
library(qiime2R)

# Function to group asv table by higher order taxonomy
group_by_taxonomy = function(asv_table, taxonomy, rank){
  asv_table = as.data.frame(asv_table)
  taxonomy = as.data.frame(taxonomy)
  taxonomy$ASV = rownames(taxonomy)
  asv_table$ASV = rownames(asv_table)
  asv_table = inner_join(taxonomy,asv_table,by="ASV")
  asv_table$taxa = apply(asv_table[,1:rank],1,paste,collapse=" ")
  asv_table = asv_table[,-(1:8)]
  asv_table = group_by(data.frame(asv_table),taxa)
  taxa_table = as.data.frame(summarise_all(asv_table,sum))
  rownames(taxa_table) = taxa_table$taxa
  return(taxa_table[,-1])
}

# Import the sorted Parkinson's metadata into R
p_metadata <- read_tsv("parkinsons_metadata_sorted_grouped.tsv")

# Subset metadata to only include healthy, overweight, and obese BMI groupings
PD_metadata <- p_metadata[(p_metadata$BMI_grouping=="healthy") | (p_metadata$BMI_grouping=="overweight") | (p_metadata$BMI_grouping=="obese"),] %>% subset(Disease=="PD")

# Remove the "#SampleID" header for the first column of the metadata to prepare for creation of a phyloseq object
PD_metadata <- PD_metadata %>%
  column_to_rownames(var = "#SampleID")

# Preparing.qza files and metadata and creating a phyloseq object
sam_data = sample_data(PD_metadata)
features_phyloseq <- qza_to_phyloseq(features = "parkinsons-table.qza")
tree_phyloseq <- qza_to_phyloseq(tree = "rooted-tree.qza")
taxonomy_phyloseq <- qza_to_phyloseq(taxonomy = "parkinsons-taxonomy.qza")
phyloseq_object_all = phyloseq(sample_data(sam_data), features_phyloseq, tree_phyloseq, taxonomy_phyloseq)

# Access the otu_table(), sample_data(), and tax_table() phyloseq object components
taxa_table <- phyloseq_object_all@otu_table
taxonomy <- phyloseq_object_all@tax_table
metadata <- phyloseq_object_all@sam_data

# Create the taxa table for the rank of taxonomy required
# Kingdom=1, Phylum=2, Class=3, Order=4, Family=5, Genus=6, Species=7
taxa_table = group_by_taxonomy(taxa_table, taxonomy, 5)

# Calculate indicator values
indicator_multipatt = multipatt(t(taxa_table),metadata$BMI_grouping,duleg=T)

# Look at output
summary(indicator_multipatt)

# Write output to file (only writes significant indicators)
indicator_output = capture.output(summary(indicator_multipatt,indvalcomp=TRUE))
write.table(indicator_output,file="indiciator_values_family_healthyoverweightobese_PD.txt",row.names=F,quote=F)