#### Load packages ####
library(tidyverse)
library(phyloseq)
library(vegan)
library(ape)

#### Upload files ####
metafp <- "parkinsons_metadata.txt"
meta <- read_delim(metafp, delim="\t")
otufp <- "feature-table_new.txt"
otu <- read_delim(file = otufp, delim="\t", skip=1)
taxfp <- "taxonomy.tsv"
tax <- read_delim(taxfp, delim="\t")
phylotree <- "rooted_tree.nwk"
phylotree <- read.tree(phylotree)

#### Format data ahead of creating Phyloseq object ####
## Format Feature Table and Metadata ahead of generating a Phyloseq object
otu_mat <- as.matrix(otu[,-1])
rownames(otu_mat) <- otu$`#OTU ID`
OTU <- otu_table(otu_mat, taxa_are_rows = T) 

samp_df <- as.data.frame(meta[,-1])
rownames(samp_df)<- meta$`#SampleID`

## Conversion of 1-0 numerical values to Yes-No
samp_df$entacapone<-ifelse(samp_df$entacapone==1,"Yes","No")
samp_df$pramipexole<-ifelse(samp_df$pramipexole==1,"Yes","No")
samp_df$rasagiline<-ifelse(samp_df$rasagiline==1,"Yes","No")
samp_df$amantadine<-ifelse(samp_df$amantadine==1,"Yes","No")
samp_df$IBS<-ifelse(samp_df$IBS==1,"Yes","No")
samp_df$Laxative_use<-ifelse(samp_df$Laxative_use==1,"Yes","No")
samp_df$Constipation_any<-ifelse(samp_df$Constipation_any==1,"Yes","No")
samp_df$Depressed<-ifelse(samp_df$Depressed==1,"Yes","No")
samp_df$BDIOver5<-ifelse(samp_df$BDIOver5==1,"Yes","No")

# Define new categorical variable for Bristol Score 
samp_df <- within(samp_df, {   
  Bristol.cat <- NA # need to initialize variable
  Bristol.cat[Bristol < 3] <- "Constipated"
  Bristol.cat[Bristol >= 3 & Bristol < 5] <- "Normal"
  Bristol.cat[Bristol >= 5] <- "Diarrheal"
} )

SAMP <- sample_data(samp_df)

# Reformat taxa table; already split by domain, phylum, class etc, 
# but includes confd. values
taxa <- tax %>% 
  separate(col=`Feature ID Taxon Confidence`, sep=" "
           , into = c("Feature ID","Domain"))

taxa <- taxa %>% 
  separate(col=`...5`, sep=" "
           , into = c("...5","ConsensusFamily")) %>%
  separate(col=`...6`, sep=" "
           , into = c("...6","ConsensusGenus")) %>%
  separate(col=`...7`, sep=" "
           , into = c("...7","ConsensusSpecies"))

taxa <- taxa %>% 
  select(-ConsensusFamily, -ConsensusGenus, -ConsensusSpecies)

# Rename column names from the taxa table
colnames(taxa)[3] = "Phylum"
colnames(taxa)[4] = "Class"
colnames(taxa)[5] = "Order"
colnames(taxa)[6] = "Family"
colnames(taxa)[7] = "Genus"
colnames(taxa)[8] ="Species"

taxa <- filter(taxa,  
               Domain == "d__Bacteria" & 
                 Class!="c__Chloroplast" & Family !="f__Mitochondria")

# Reformat taxa table
taxa_matrix <- as.matrix(taxa[,-1])
rownames(taxa_matrix) <- taxa$`Feature ID`
TAX <- tax_table(taxa_matrix)

#### Make Phyloseq object ####
PARKINSONS <- phyloseq(OTU, SAMP, TAX, tree=phylotree)
ALL_DATA <- data.frame(sample_data(PARKINSONS))
nrow(ALL_DATA)

# Filter to include only those with PD; we are interested in possible confounders
# among the PD group only
DISEASE_ONLY <- subset_samples(PARKINSONS, Disease == "PD")
DISEASE_ONLY
DISEASE_RARE <- rarefy_even_depth(DISEASE_ONLY, rngseed = 1, sample.size = 3797)

#### Need to remove NA values before proceeding with alpha and beta diversity analyses ####
library(microViz)
PD_INCRM <- ps_drop_incomplete(DISEASE_RARE, vars = "levo_equiv_dose", verbose = TRUE)
PD_INCRM <- ps_drop_incomplete(DISEASE_RARE, vars = "entacapone", verbose = TRUE)

DISEASE_DATA <- data.frame(sample_data(DISEASE_RARE))
DISEASE_RARE_DATA <- data.frame(sample_data(PD_INCRM))
DISEASE_ONLY_DATA <- data.frame(sample_data(DISEASE_ONLY))

#### Subset data for core microbiome analysis and indicator species analysis
# Isolate L-dopa-treated patients only
LEVO_ONLY <- subset_samples(PD_INCRM, entacapone == "No" & amantadine == "No" & rasagiline == "No"
                            & pramipexole == "No")

## Include only PD_L and PD_LE patients
E_and_L_ONLY <- subset_samples(PD_INCRM, amantadine == "No" & rasagiline == "No" &
                                 pramipexole == "No")
E_and_L_ONLY_DATA <- data.frame(sample_data(E_and_L_ONLY))

## Include either patients that take laxatives or do not 
E_and_L_ONLY_LAX <- subset_samples(E_and_L_ONLY, Laxative_use == "Yes")
E_and_L_ONLY_NOLAX <- subset_samples(E_and_L_ONLY, Laxative_use == "No")

#### Indicator Species Analysis ####
library(indicspecies)

# First, we will look at PD patients taking laxatives only. 
ELAX_ONLY <- subset_samples(E_and_L_ONLY_LAX, entacapone == "Yes")
LAX_ONLY <- subset_samples(LEVO_ONLY, Laxative_use=="Yes")
LAX_EL <- merge_phyloseq(ELAX_ONLY, LAX_ONLY)

# Glom by genus. 
LAX_EL_GLOM <- tax_glom(LAX_EL, "Genus", NArm = FALSE)
LAX_EL_RA <- transform_sample_counts(LAX_EL_GLOM, fun=function(x) x/sum(x))

# Perform indicator species analysis. Determine indicator taxa among patients taking entacapone and those that do not. 
isa_LAXEL <- multipatt(t(otu_table(LAX_EL_RA)), 
                       cluster = sample_data(LAX_EL_RA)$entacapone)
summary(isa_LAXEL)
taxtable <- tax_table(LAX_EL) %>% as.data.frame() %>% rownames_to_column(var="ASV")

# View results
results_isa_LAXEL <- isa_LAXEL$sign %>%
  rownames_to_column(var="ASV") %>%
  left_join(taxtable) %>%
  filter(p.value<0.05) %>% View() 

# Next, we will look at PD patients that do not take laxatives. 
ENOLAX_ONLY <- subset_samples(E_and_L_ONLY_NOLAX, entacapone == "Yes")
NOLAX_ONLY <- subset_samples(LEVO_ONLY, Laxative_use=="No")
NOLAX_EL <- merge_phyloseq(ENOLAX_ONLY, NOLAX_ONLY)

# Glom by genus
NOLAX_EL_GLOM <- tax_glom(NOLAX_EL, "Genus", NArm = FALSE)
NOLAX_EL_RA <- transform_sample_counts(NOLAX_EL_GLOM, fun=function(x) x/sum(x))

# Perform indicator species analysis. Determine indicator taxa among patients taking entacapone and those that do not. 
isa_NOLAXEL <- multipatt(t(otu_table(NOLAX_EL_RA)), 
                         cluster = sample_data(NOLAX_EL_RA)$entacapone)
summary(isa_NOLAXEL)
taxtable <- tax_table(NOLAX_EL) %>% as.data.frame() %>% rownames_to_column(var="ASV")

# View results 
results_isa_NOLAXEL <- isa_NOLAXEL$sign %>%
  rownames_to_column(var="ASV") %>%
  left_join(taxtable) %>%
  filter(p.value<0.05) %>% View() 

#### Core microbiome analysis ####
library(microbiome)
library(ggVennDiagram)

## Subset Phyloseq and modify objects
LEVO_LAX <- subset_samples(LEVO_ONLY, Laxative_use=="Yes") # PD_L only, Lax
LEVO_NOLAX <- subset_samples(LEVO_ONLY, Laxative_use=="No") # PD_L only, no lax
ELAX_ONLY <- subset_samples(E_and_L_ONLY_LAX, entacapone == "Yes") # PD_LE only, lax
ENOLAX_ONLY <- subset_samples(E_and_L_ONLY_NOLAX, entacapone == "Yes") # PD_LE only, nolax

# Convert to relative abundance
LEVOLAX_RA <- transform_sample_counts(LEVO_LAX, fun=function(x) x/sum(x))
LEVONOLAX_RA <- transform_sample_counts(LEVO_NOLAX, fun=function(x) x/sum(x))
ELAX_RA <- transform_sample_counts(ELAX_ONLY, fun=function(x) x/sum(x))
ENOLAX_RA <- transform_sample_counts(ENOLAX_ONLY, fun=function(x) x/sum(x))

# Set a prevalence threshold and abundance threshold:
# What ASVs are found in more than 80% of samples in each category?
LEVOLAX_ASVs <- core_members(LEVOLAX_RA, detection=0.001, prevalence = 0.25)
LEVONOLAX_ASVs <- core_members(LEVONOLAX_RA, detection=0.001, prevalence = 0.25)
ELAX_ASVs <- core_members(ELAX_RA, detection=0.001, prevalence = 0.25)
ENOLAX_ASVs <- core_members(ENOLAX_RA, detection=0.001, prevalence = 0.25)

list(LevoLax = LEVOLAX_ASVs, LevoNoLax = LEVONOLAX_ASVs, ELax = ELAX_ASVs, ENoLax = ENOLAX_ASVs)

# Make a Venn-diagram
ggVennDiagram(x=list(LevoLax = LEVOLAX_ASVs, LevoNoLax = LEVONOLAX_ASVs, 
                     ELax = ELAX_ASVs, ENoLax = ENOLAX_ASVs),
              category.names = c("L, laxatives use",
                                 "L, no laxatives",
                                 "LE, laxative use",
                                 "LE, no laxatives"),
              color = "black") + 
  scale_fill_gradient(low = "#F4FAFE", high = "#073025")