R Script PART Checklist 

# Parsa March 13 2022

#Metadata Modifications in R 

#loading packages 
library(tidyverse)
library(dplyr)
library(stringr)
library(readxl)
library(janitor)


#importing dataset
original <- read_excel("C:/Users/Parsa Abrishamkar/Desktop/447_project_2/parkinsons_metadata.xlsx")


#selecting columns of interest
metadata <- original %>% select(`#SampleID` ,Disease, FSS_fatigue_score, Sleep_problems, 
                                Antidepressant_use, Apathy_score)


#converting Disease into a factor
metadata <- metadata %>% mutate(Disease = as_factor(Disease))

#Categorizing fatigue score into binary groups while retaining NA values
metadata <- metadata %>% mutate(fatigue_binary = 
                                case_when(FSS_fatigue_score >= 4 ~ "yes", 
                                          FSS_fatigue_score < 4 ~ "no", 
                                          TRUE ~ "NA"))

#Categorizing apathy score into binary groups while retaining NA values
metadata <- metadata %>% mutate(apathy_binary = 
                                case_when(Apathy_score >= 14 ~ "yes", 
                                          Apathy_score < 14 ~ "no", 
                                          TRUE ~ "NA"))

#removing non-binary fatigue and apathy columns 
metadata <- metadata %>% select(`#SampleID`, Disease, fatigue_binary, Sleep_problems, 
                                Antidepressant_use, apathy_binary)


#converting NA values into strings in Sleep_problems and 
#Antidepressant_use columns into string in order to be consistent with 
#the fatigue_binary and apathy_binary columns

metadata <- metadata %>% mutate(sleep_problems = 
                                  case_when(Sleep_problems == "Yes" ~ "yes", 
                                            Sleep_problems == "No" ~ "no", 
                                            TRUE ~ "NA"), 
                                antidepressant_use = 
                                  case_when(Antidepressant_use == "Yes" ~ "yes", 
                                            Antidepressant_use == "No" ~ "no", 
                                            TRUE ~ "NA"))

#removing the original Sleep_problems and Antidepressant_use columns
metadata <- metadata %>% select(`#SampleID`, Disease, fatigue_binary, sleep_problems, 
                                antidepressant_use, apathy_binary)


#exporting metadata as a tsv file
write_tsv(metadata, path = "metadata.tsv")


------------------------------------------------------------------------------------------------------------------------------------------------------


#Parsa 28 Mar 2022 (5:25 PM)


#Differential abundance analysis


# Load CRAN packages
library(tidyverse)
library(vegan)
library(ape)

# Load Bioconductor packages
library(phyloseq)
library(DESeq2)

# Calculate relative abundance
calculate_relative_abundance <- function(x) x / sum(x)

# Calculate geometric mean 
calculate_gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}


#setting random numbers
set.seed(711)


# Export biom file and tree from QIIME2 and provide original metadata file
biom_file <- import_biom("table-with-taxonomy.biom")
metadata  <- import_qiime_sample_data("metadata.tsv")
tree      <- read_tree_greengenes("tree.nwk")


# Convert from multichotomous to dichotmous tree
tree <- multi2di(tree)

# Combine all information into a single phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)


#calling physeq object to see if number of ASVs and samples match with table.qzv values
physeq

#converting taxonomic ranks from numbers to proper names
colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class","Order", "Family",
                                 "Genus", "Species")
                                 
#viewing sequencing depths
sample_sums(physeq)

#checking if any sample has sequencing depth of less than 6000
sample_sums(physeq) >= 6000

#there are a few samples with less than 6000 sequencing depth
#we will eliminate these samples
at_least_6000 <- prune_samples(sample_sums(physeq) >= 6000, physeq)
sample_sums(at_least_6000)


#subsetting the metadata to only include PD patients that do not have NA
#for antidepressant_use
PD_antidep_metadata <- subset_samples(physeq, 
                                      Disease == "PD" & antidepressant_use != "NA") 
                                      
                                      
#remove low abundance features at 0.0005 threshold
PD_antidep_counts <- taxa_sums(PD_antidep_metadata)
relative_abundance_PD_antidep <- calculate_relative_abundance(PD_antidep_counts)

abundant_PD_antidep <- relative_abundance_PD_antidep > 0.0005 
abundant_PD_antidep_taxa <- prune_taxa(abundant_PD_antidep, PD_antidep_metadata)

#setting taxnomic level to genus
abundant_PD_antidep_genera <- tax_glom(abundant_PD_antidep_taxa, 
                                       taxrank = "Genus")
abundant_PD_antidep_genera


#converting antidepressant_use values into factors and setting
# No antidepressant use as the reference group
sample_data(abundant_PD_antidep_genera)$antidepressant_use <-
  factor(sample_data(abundant_PD_antidep_genera)$antidepressant_use,
         levels = c("no", "yes"))
         
         
#creating DESeq2 object
deseq_PD_antidep <- phyloseq_to_deseq2(abundant_PD_antidep_genera, ~ antidepressant_use)
geo_means <- apply(counts(deseq_PD_antidep), 1, calculate_gm_mean)
deseq_PD_antidep <- estimateSizeFactors(deseq_PD_antidep, geoMeans = geo_means)
deseq_PD_antidep <- DESeq(deseq_PD_antidep, fitType = "local")

PD_antidep_diff_abund <- results(deseq_PD_antidep)


#setting alpha level at 0.05 and converting DEseq results to a data frame
# and filtering for significantly different taxa
alpha <- 0.05
significant_PD_antidep <- as.data.frame(PD_antidep_diff_abund)
significant_PD_antidep <- filter(significant_PD_antidep, padj < alpha)


#merging table with significant results with table of taxonomic information
# arranging log2 fold change results in descending order 
genera_df <- as.data.frame(tax_table(abundant_PD_antidep_genera))
significant_PD_antidep <- merge(significant_PD_antidep, genera_df, by = "row.names")
significant_PD_antidep <- arrange(significant_PD_antidep, log2FoldChange)

dim(significant_PD_antidep)

significant_PD_antidep


--------------------------------------------------------------------------------------------------------------------------------------------------------
#Parsa: 01 Apr 2022 (3:50 PM)

#Relative abundance analysis for PD subjects based on antidepressant use


# Load CRAN packages
library(tidyverse)
library(vegan)
library(ape)

# Load Bioconductor packages
library(phyloseq)
library(DESeq2)


# Calculate relative abundance
calculate_relative_abundance <- function(x) x / sum(x)

# Calculate geometric mean 
calculate_gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}


#setting random numbers
set.seed(711)

# Export biom file and tree from QIIME2 and provide original metadata file
biom_file <- import_biom("table-with-taxonomy.biom")
metadata  <- import_qiime_sample_data("metadata.tsv")
tree      <- read_tree_greengenes("tree.nwk")


# Convert from multichotomous to dichotmous tree
tree <- multi2di(tree)

# Combine all information into a single phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)


#calling physeq object to see if number of ASVs and samples match with table.qzv values
physeq

#converting taxonomic ranks from numbers to proper names
colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class","Order", "Family",
                                 "Genus", "Species")


#viewing sequencing depths
sample_sums(physeq)

#checking if any sample has sequencing depth of less than 6000
sample_sums(physeq) >= 6000

#there are a few samples with less than 6000 sequencing depth
#we will eliminate these samples
at_least_6000 <- prune_samples(sample_sums(physeq) >= 6000, physeq)
sample_sums(at_least_6000)

#sub-setting the metadata to only include PD patients that do not have NA
#for antidepressant_use
PD_antidep_metadata <- subset_samples(physeq, 
                                      Disease == "PD" & antidepressant_use != "NA")


#calculate relative abundance
PD_RA <- transform_sample_counts(PD_antidep_metadata, calculate_relative_abundance)


#remove low abundance features at 0.0005 threshold
PD_antidep_counts <- taxa_sums(PD_antidep_metadata)
relative_abundance_PD_antidep <- calculate_relative_abundance(PD_antidep_counts)

abundant_PD_antidep <- relative_abundance_PD_antidep > 0.0005 
abundant_PD_antidep_RA_taxa <- prune_taxa(abundant_PD_antidep, PD_RA)

#setting taxonomic level to phylum 
abundant_PD_antidep_phyla <- tax_glom(abundant_PD_antidep_RA_taxa, 
                                       taxrank = "Phylum")


#transforming data frame from wide to long form
abundant_PD_antidep_phyla_long <- psmelt(abundant_PD_antidep_phyla)

#creating relative abundance plot for PD patients based on 
#their antidepressant use and grouped by respective their microbiota phyla 

#step 1: adding new column to the dataset that removed the p__ from the names in the Phylum column 
abundant_PD_antidep_phyla_long <- abundant_PD_antidep_phyla_long %>% 
  mutate(phylum_fixed = case_when(Phylum == "p__Actinobacteriota" ~ "Actinobacteriota", 
                                  Phylum == "p__Bacteroidota" ~ "Bacteroidota", 
                                  Phylum == "p__Cyanobacteria" ~ "Cyanobacteria", 
                                  Phylum == "p__Desulfobacterota" ~ "Desulfobacterota", 
                                  Phylum == "p__Firmicutes" ~ "Firmicutes", 
                                  Phylum == "p__Proteobacteria" ~ "Proteobacteria", 
                                  Phylum == "p__Verrucomicrobiota" ~ "Verrucomicrobiota"))


#step 2: color-blind friendly palette
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


#step 3: creating the plot
relative_abundance_PD_plot <- ggplot(abundant_PD_antidep_phyla_long, 
                                  aes(x = antidepressant_use, 
                                      y = Abundance)) + 
  geom_col(aes(fill=phylum_fixed)) + 
  labs(title = "Relative Abundance for PD Subjects Based on Antidepressant Use", 
      x = "Antidepressant Use", y = "Relative Abundance") + 
  theme_bw() + scale_fill_manual(values=cbPalette) + 
  guides(fill=guide_legend(title="Phylum"))

relative_abundance_PD_plot

--------------------------------------------------------------------------------------------------------------------------------------------------------------
#Parsa: 01 Apr 2022 (6:06 PM)

#Relative abundance analysis for control subjects based on antidepressant use

# Load CRAN packages
library(tidyverse)
library(vegan)
library(ape)

# Load Bioconductor packages
library(phyloseq)
library(DESeq2)


# Calculate relative abundance
calculate_relative_abundance <- function(x) x / sum(x)

# Calculate geometric mean 
calculate_gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}


#setting random numbers
set.seed(711)

# Export biom file and tree from QIIME2 and provide original metadata file
biom_file <- import_biom("table-with-taxonomy.biom")
metadata  <- import_qiime_sample_data("metadata.tsv")
tree      <- read_tree_greengenes("tree.nwk")


# Convert from multichotomous to dichotmous tree
tree <- multi2di(tree)

# Combine all information into a single phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)


#calling physeq object to see if number of ASVs and samples match with table.qzv values
physeq

#converting taxonomic ranks from numbers to proper names
colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class","Order", "Family",
                                 "Genus", "Species")


#viewing sequencing depths
sample_sums(physeq)

#checking if any sample has sequencing depth of less than 6000
sample_sums(physeq) >= 6000

#there are a few samples with less than 6000 sequencing depth
#we will eliminate these samples
at_least_6000 <- prune_samples(sample_sums(physeq) >= 6000, physeq)
sample_sums(at_least_6000)

#sub-setting the metadata to only include control patients that do not have NA
#for antidepressant_use
control_antidep_metadata <- subset_samples(physeq, 
                                      Disease == "Control" & antidepressant_use != "NA")

#calculate relative abundance
control_RA <- transform_sample_counts(control_antidep_metadata, calculate_relative_abundance)


#remove low abundance features at 0.0005 threshold
control_antidep_counts <- taxa_sums(control_antidep_metadata)
relative_abundance_control_antidep <- calculate_relative_abundance(control_antidep_counts)

abundant_control_antidep <- relative_abundance_control_antidep > 0.0005 
abundant_control_antidep_RA_taxa <- prune_taxa(abundant_control_antidep, control_RA)

#setting taxonomic level to phylum 
abundant_control_antidep_phyla <- tax_glom(abundant_control_antidep_RA_taxa, 
                                      taxrank = "Phylum")


#transforming data frame from wide to long form
abundant_control_antidep_phyla_long <- psmelt(abundant_control_antidep_phyla)

#creating relative abundance plot for PD patients based on 
#their antidepressant use and grouped by respective their microbiota phyla 

#step 1: adding new column to the dataset that removed the p__ from the names in the Phylum column 
abundant_control_antidep_phyla_long <- abundant_control_antidep_phyla_long %>% 
  mutate(phylum_fixed = case_when(Phylum == "p__Actinobacteriota" ~ "Actinobacteriota", 
                                  Phylum == "p__Bacteroidota" ~ "Bacteroidota", 
                                  Phylum == "p__Cyanobacteria" ~ "Cyanobacteria", 
                                  Phylum == "p__Desulfobacterota" ~ "Desulfobacterota", 
                                  Phylum == "p__Firmicutes" ~ "Firmicutes", 
                                  Phylum == "p__Proteobacteria" ~ "Proteobacteria", 
                                  Phylum == "p__Verrucomicrobiota" ~ "Verrucomicrobiota", 
                                  Phylum == "p__Spirochaetota" ~ "Spirochaetota"))

#step 2: color-blind friendly palette
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


#step 3: creating the plot
relative_abundance_control_plot <- ggplot(abundant_control_antidep_phyla_long, 
                                  aes(x = antidepressant_use, 
                                      y = Abundance)) + 
  geom_col(aes(fill=phylum_fixed)) + 
  labs(title = "Relative Abundance for Control Subjects Based on Antidepressant Use", 
       x = "Antidepressant Use", y = "Relative Abundance") + 
  theme_bw() + scale_fill_manual(values=cbPalette) + 
  guides(fill=guide_legend(title="Phylum")) + 
  expand_limits(y=c(0, 70))

relative_abundance_control_plot

--------------------------------------------------------------------------------------------------------------------------------------------------------------
#Parsa: 03 Apr 2022 (11:58 PM)

##relative abundance analysis for PD subjects based on antidepressant use using microeco 

#Install CRAN packages
install.packages("microeco")
install.packages("file2meco")
install.packages("picante")
install.packages("GUniFrac")
install.packages("ggalluvial")
install.packages("ggh4x")
install.packages("ggpubr")
install.packages("randomForest")
install.packages("igraph")
install.packages("rgexf")
install.packages("htmlwidgets")


# Load CRAN packages
library(here)
library(tidyverse)
library(microeco)
library(cowplot)
library(file2meco)
library(picante)
library(GUniFrac)
library(ggalluvial)
library(ggh4x)
library(ggpubr)
library(randomForest)
library(igraph)
library(rgexf)
library(htmlwidgets)


# set.seed is used to fix the random number generation to make the results repeatable
set.seed(711)
theme_set(theme_bw(base_size=18))
calculate_relative_abundance <- function(x) x/sum(x)


#importing qza artifacts to microeco 
(meco_data<-qiime2meco(
  ASV_data=here::here("table.qza"),
  phylo_tree=here::here("rooted-tree.qza"),
  taxonomy_data=here::here("taxonomy.qza"),
  sample_data = here::here("meco-metadata.txt")))


#Create a dummy copy of our dataset
(dataset <- clone(meco_data))

#Define our minimum sequencing depth of 6000
min_seq_depth <- 6000
#Check whether all samples have more reads than the minimum threshold
all(dataset$sample_sums() > min_seq_depth)

#Use select_if to select only columns that evaluate to TRUE
dataset$otu_table <- dataset$otu_table %>% select_if(dataset$sample_sums() > min_seq_depth)

#Use $tidy_dataset() to propagate changes everywhere and make sure all components are aligned
dataset$tidy_dataset()

#Filter the samples to only include PD patients that do not have NA for antidepressant use
#Use dplyr to do that
dataset$sample_table <- dataset$sample_table %>% filter(`Disease` == "PD", antidepressant_use != "NA")
#tidy the whole microtable
dataset$tidy_dataset()


#Calculate relative abundance
dataset$cal_abund(rel=T)


#removing low abundant features
PD_meco_total_counts <- dataset$taxa_sums()
PD_meco_relative_abundance <- calculate_relative_abundance(PD_meco_total_counts)


PD_meco_abundant <- PD_meco_relative_abundance > 0.0005 
#Filter out the features that don't pass the check
dataset$otu_table <- dataset$otu_table [PD_meco_abundant,]
#Propagate the changes across the dataset
dataset$tidy_dataset()

#setting taxonomic level to phylum

#extract relative abundance information on phyla
PD_meco_phyla_RA <- dataset$taxa_abund$Phylum %>% as_tibble(rownames = "Feature_Taxonomy")

#turning table from wide to long form
PD_meco_phyla_RA <- PD_meco_phyla_RA %>% pivot_longer(-Feature_Taxonomy, values_to = "Relative_Abundance", names_to = "SampleID")
#Now we can join this with the metadata and plot
PD_meco_phyla_RA <- PD_meco_phyla_RA %>% left_join(dataset$sample_table)


#creating barplot
t1 <- trans_abund$new(dataset = dataset, taxrank = "Phylum", ntaxa = 10, groupmean = "antidepressant_use")
t1$plot_bar()


------------------------------------------------------------------------------------------------------------------------------------------------------------
#Parsa: 03 Apr 2022 (12:09 PM)

##relative abundance analysis using microeco for control subjects based on 
##antidepressant use

# Load CRAN packages
library(here)
library(tidyverse)
library(microeco)
library(cowplot)
library(file2meco)
library(picante)
library(GUniFrac)
library(ggalluvial)
library(ggh4x)
library(ggpubr)
library(randomForest)
library(igraph)
library(rgexf)
library(htmlwidgets)


# set.seed is used to fix the random number generation to make the results repeatable
set.seed(711)
theme_set(theme_bw(base_size=18))
calculate_relative_abundance <- function(x) x/sum(x)


#importing qza artifacts to microeco 
(meco_data<-qiime2meco(
  ASV_data=here::here("table.qza"),
  phylo_tree=here::here("rooted-tree.qza"),
  taxonomy_data=here::here("taxonomy.qza"),
  sample_data = here::here("meco-metadata.txt")))


#Create a dummy copy of our dataset
(dataset_control <- clone(meco_data))

#Define our minimum sequencing depth of 6000
min_seq_depth <- 6000
#Check whether all samples have more reads than the minimum threshold
all(dataset_control$sample_sums() > min_seq_depth)

#Use select_if to select only columns that evaluate to TRUE
dataset_control$otu_table <- dataset_control$otu_table %>% select_if(dataset_control$sample_sums() > min_seq_depth)

#Use $tidy_dataset() to propagate changes everywhere and make sure all components are aligned
dataset_control$tidy_dataset()

#Filter the samples to only include PD patients that do not have NA for antidepressant use
#Use dplyr to do that
dataset_control$sample_table <- dataset_control$sample_table %>% filter(`Disease` == "Control", antidepressant_use != "NA")
#tidy the whole microtable
dataset_control$tidy_dataset()


#Calculate relative abundance
dataset_control$cal_abund(rel=T)


#removing low abundant features
control_meco_total_counts <- dataset_control$taxa_sums()
control_meco_relative_abundance <- calculate_relative_abundance(control_meco_total_counts)


control_meco_abundant <- control_meco_relative_abundance > 0.0005 
#Filter out the features that don't pass the check
dataset_control$otu_table <- dataset_control$otu_table [control_meco_abundant,]
#Propagate the changes across the dataset
dataset_control$tidy_dataset()

#setting taxonomic level to phylum

#extract relative abundance information on phyla
control_meco_phyla_RA <- dataset_control$taxa_abund$Phylum %>% as_tibble(rownames = "Feature_Taxonomy")

#turning table from wide to long form
control_meco_phyla_RA <- control_meco_phyla_RA %>% pivot_longer(-Feature_Taxonomy, values_to = "Relative_Abundance", names_to = "SampleID")
#Now we can join this with the metadata and plot
control_meco_phyla_RA <- control_meco_phyla_RA %>% left_join(dataset_control$sample_table)


#creating barplot
t1_control <- trans_abund$new(dataset = dataset_control, taxrank = "Phylum", ntaxa = 10, groupmean = "antidepressant_use")
t1_control$plot_bar()

---------------------------------------------------------------------------------------------------------------------------------------------------------
##relative abundance analysis using microeco for all subjects based on 
##disease state and antidepressant use status


# Load CRAN packages
library(here)
library(tidyverse)
library(microeco)
library(cowplot)
library(file2meco)
library(picante)
library(GUniFrac)
library(ggalluvial)
library(ggh4x)
library(ggpubr)
library(randomForest)
library(igraph)
library(rgexf)
library(htmlwidgets)


# set.seed is used to fix the random number generation to make the results repeatable
set.seed(711)
theme_set(theme_bw(base_size=18))
calculate_relative_abundance <- function(x) x/sum(x)


#importing qza artifacts to microeco 
(meco_merge_data<-qiime2meco(
  ASV_data=here::here("table.qza"),
  phylo_tree=here::here("rooted-tree.qza"),
  taxonomy_data=here::here("taxonomy.qza"),
  sample_data = here::here("meco_merge_metadata.txt")))


#Create a dummy copy of our dataset
(dataset_merge <- clone(meco_merge_data))

#Define our minimum sequencing depth of 6000
min_seq_depth <- 6000
#Check whether all samples have more reads than the minimum threshold
all(dataset_merge$sample_sums() > min_seq_depth)

#Use select_if to select only columns that evaluate to TRUE
dataset_merge$otu_table <- dataset_merge$otu_table %>% select_if(dataset_merge$sample_sums() > min_seq_depth)

#Use $tidy_dataset() to propagate changes everywhere and make sure all components are aligned
dataset_merge$tidy_dataset()


#tidy the whole microtable
dataset_merge$tidy_dataset()


#Calculate relative abundance
dataset_merge$cal_abund(rel=T)


#removing low abundant features
merge_meco_total_counts <- dataset_merge$taxa_sums()
merge_meco_relative_abundance <- calculate_relative_abundance(merge_meco_total_counts)


merge_meco_abundant <- merge_meco_relative_abundance > 0.0005 
#Filter out the features that don't pass the check
dataset_merge$otu_table <- dataset_merge$otu_table [merge_meco_abundant,]
#Propagate the changes across the dataset
dataset_merge$tidy_dataset()

#setting taxonomic level to family

#extract relative abundance information based on family
merge_meco_family_RA <- dataset_merge$taxa_abund$Family %>% as_tibble(rownames = "Feature_Taxonomy")

#turning table from wide to long form
merge_meco_family_RA <- merge_meco_family_RA %>% pivot_longer(-Feature_Taxonomy, values_to = "Relative_Abundance", names_to = "SampleID")
#Now we can join this with the metadata and plot
merge_meco_family_RA <- merge_meco_family_RA %>% left_join(dataset_merge$sample_table)


#creating barplot
t1_merge <- trans_abund$new(dataset = dataset_merge, taxrank = "Family", ntaxa = 10, groupmean = "merged")
t1_merge$plot_bar()


--------------------------------------------------------------------------------------------------------------------------------------------------------------

#Steven April 3 (7:38pm)
#Creating Figure 1: 4 way Venn diagram for taxonomic analysis

#Install CRAN packages
install.packages("microeco")
install.packages("file2meco")
install.packages("picante")
install.packages("GUniFrac")
install.packages("ggalluvial")
install.packages("ggh4x")
install.packages("ggpubr")
install.packages("randomForest")
install.packages("igraph")
install.packages("rgexf")
install.packages("htmlwidgets")

# Load CRAN packages
library(here)
library(tidyverse)
library(microeco)
library(cowplot)
library(file2meco)
library(picante)
library(GUniFrac)
library(ggalluvial)
library(ggh4x)
library(ggpubr)
library(randomForest)
library(igraph)
library(rgexf)
library(htmlwidgets)

# set.seed is used to fix the random number generation to make the results repeatable
set.seed(711)
theme_set(theme_bw(base_size=18))
calculate_relative_abundance <- function(x) x/sum(x)

#qiime2 to microeco using qiime2meco
(meco_data<-qiime2meco(ASV_data=here::here("Project 2 files","table.qza"),phylo_tree=here::here("Project 2 files","rooted-tree.qza"),
taxonomy_data=here::here("Project 2 files","taxonomy.qza"),sample_data = here::here("Project 2 files","metadata.tsv")))

#remove NA, selecting for columns of use and concatenating them
meco_data$sample_table <- meco_data$sample_table%>% filter(antidepressant_use == "yes" | antidepressant_use == "no" ) %>% select(Disease, antidepressant_use) %>% unite('Merged', Disease:antidepressant_use)

#tidying the whole dataset 
#tidy the whole microtable
meco_data$tidy_dataset()

#merging samples to create 4 unique rows: PD(Yes), PD(No), Control(Yes), Control(No)
meco_data1 <- meco_data$merge_samples(use_group = "Merged")

# create trans_venn object and visualizing it 
t2 <- trans_venn$new(meco_data1, ratio = "numratio")
t2$plot_venn()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#Claudia April 12th at 5:00 pm
###Indicator taxa analysis in control patients that use antidepressants 

#MICB 447 script for Indicator taxa analysis_Control_Samples
library(tidyverse)
library(phyloseq)
# library(labdsv) # Only need to uncomment this if using the older indicator taxa function at the bottom
library(indicspecies)
#Used to install qiime2R (next two lines)
#install.packages("remotes")
#remotes::install_github("jbisanz/qiime2R")
library(qiime2R)

# Function to group asv table by higher order taxonomy
# This is actually good to do at the species level (rank 7) to make it easier to interpret
group_by_taxonomy = function(asv_table, taxonomy, rank){
  asv_table = as.data.frame(asv_table)
  taxonomy = as.data.frame(taxonomy)
  taxonomy$ASV = rownames(taxonomy)
  asv_table$ASV = rownames(asv_table)
  asv_table = inner_join(taxonomy,asv_table,by="ASV")
  asv_table$taxa = apply(asv_table[,1:rank],1,paste,collapse=" ")
  asv_table = asv_table[,-(1:8)]
  asv_table = group_by(data.frame(asv_table),taxa)
  taxa_table = as.data.frame(summarise_all(asv_table,sum))
  rownames(taxa_table) = taxa_table$taxa
  return(taxa_table[,-1])
}

### Load data
physeq <- qza_to_phyloseq(features = "antidep-NA-filtered-out-table.qza",
                          tree = "rooted-tree.qza",
                          taxonomy = "taxonomy.qza", 
                          metadata = "indic_select_metadata.txt") 
taxa_table <- physeq@otu_table
taxonomy <- physeq@tax_table
metadata <- physeq@sam_data

### Create the taxa table for the rank of taxonomy required
# Kingdom=1, Phylum=2, Class=3, Order=4, Family=5, Genus=6, Species=7
# If you want to want to calculate indicator values by ASV just comment out the following line
taxa_table = group_by_taxonomy(taxa_table, taxonomy, 5)

### Calculate indicator values 
# change metadata column to 
indicator_multipatt = multipatt(t(taxa_table),metadata$antidepressant_use,duleg=T)


### Look at output
summary(indicator_multipatt)

### Write output to file (only writes significant indicators)
# Sometimes, if grouping by higher order taxa (e.g., species), the taxa name is too long
#   and the output pushes onto multiple lines. You can just zoom out your Rstudio window
#   and it should all work on one line again.
indicator_output = capture.output(summary(indicator_multipatt,indvalcomp=TRUE, alpha = 0.05))
write.table(indicator_output,file="indiciator_family_control.txt",row.names=F,quote=F)


# Older function in the labdsv package that is easier to filter/plot but harder make a nice table
# indicator_values = indval(t(taxa_table),metadata$body.site)

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#Claudia April 12th at 7:00 pm
###Indicator taxa analysis in PD patients that use antidepressants 

#MICB 447 script for Indicator taxa analysis PD patients
library(tidyverse)
library(phyloseq)
# library(labdsv) # Only need to uncomment this if using the older indicator taxa function at the bottom
library(indicspecies)
#Used to install qiime2R (next two lines)
#install.packages("remotes")
#remotes::install_github("jbisanz/qiime2R")
library(qiime2R)

# Function to group asv table by higher order taxonomy
# This is actually good to do at the species level (rank 7) to make it easier to interpret
group_by_taxonomy = function(asv_table, taxonomy, rank){
  asv_table = as.data.frame(asv_table)
  taxonomy = as.data.frame(taxonomy)
  taxonomy$ASV = rownames(taxonomy)
  asv_table$ASV = rownames(asv_table)
  asv_table = inner_join(taxonomy,asv_table,by="ASV")
  asv_table$taxa = apply(asv_table[,1:rank],1,paste,collapse=" ")
  asv_table = asv_table[,-(1:8)]
  asv_table = group_by(data.frame(asv_table),taxa)
  taxa_table = as.data.frame(summarise_all(asv_table,sum))
  rownames(taxa_table) = taxa_table$taxa
  return(taxa_table[,-1])
}

### Load data
physeq <- qza_to_phyloseq(features = "antidep-NA-filtered-out-table-PD.qza",
                          tree = "rooted-tree.qza",
                          taxonomy = "taxonomy.qza", 
                          metadata = "indic_select_metadata.txt") 
taxa_table <- physeq@otu_table
taxonomy <- physeq@tax_table
metadata <- physeq@sam_data

### Create the taxa table for the rank of taxonomy required
# Kingdom=1, Phylum=2, Class=3, Order=4, Family=5, Genus=6, Species=7
# If you want to want to calculate indicator values by ASV just comment out the following line
taxa_table = group_by_taxonomy(taxa_table, taxonomy, 5)

### Calculate indicator values 
# change metadata column to 
indicator_multipatt = multipatt(t(taxa_table),metadata$antidepressant_use,duleg=T)


### Look at output
summary(indicator_multipatt)

### Write output to file (only writes significant indicators)
# Sometimes, if grouping by higher order taxa (e.g., species), the taxa name is too long
#   and the output pushes onto multiple lines. You can just zoom out your Rstudio window
#   and it should all work on one line again.
indicator_output = capture.output(summary(indicator_multipatt,indvalcomp=TRUE, alpha = 0.05))
write.table(indicator_output,file="indiciator_family.txt",row.names=F,quote=F)


# Older function in the labdsv package that is easier to filter/plot but harder make a nice table
# indicator_values = indval(t(taxa_table),metadata$body.site)


----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#Claudia April 21st at 8:00 pm
###Indicator taxa analysis in both PD patients and control patients that use antidepressants 

#MICB 447 script for Indicator taxa analysis for both PD and Control
library(tidyverse)
library(phyloseq)
# library(labdsv) # Only need to uncomment this if using the older indicator taxa function at the bottom
library(indicspecies)
#Used to install qiime2R (next two lines)
#install.packages("remotes")
#remotes::install_github("jbisanz/qiime2R")
library(qiime2R)

# Function to group asv table by higher order taxonomy
# This is actually good to do at the species level (rank 7) to make it easier to interpret
group_by_taxonomy = function(asv_table, taxonomy, rank){
  asv_table = as.data.frame(asv_table)
  taxonomy = as.data.frame(taxonomy)
  taxonomy$ASV = rownames(taxonomy)
  asv_table$ASV = rownames(asv_table)
  asv_table = inner_join(taxonomy,asv_table,by="ASV")
  asv_table$taxa = apply(asv_table[,1:rank],1,paste,collapse=" ")
  asv_table = asv_table[,-(1:8)]
  asv_table = group_by(data.frame(asv_table),taxa)
  taxa_table = as.data.frame(summarise_all(asv_table,sum))
  rownames(taxa_table) = taxa_table$taxa
  return(taxa_table[,-1])
}

### Load data
physeq <- qza_to_phyloseq(features = "antidep-NA-filtered-out-both-PD-and-control-table.qza",
                          tree = "rooted-tree.qza",
                          taxonomy = "taxonomy.qza", 
                          metadata = "indic_metadata_filtered.tsv") 
taxa_table <- physeq@otu_table
taxonomy <- physeq@tax_table
metadata <- physeq@sam_data

### Create the taxa table for the rank of taxonomy required
# Kingdom=1, Phylum=2, Class=3, Order=4, Family=5, Genus=6, Species=7
# If you want to want to calculate indicator values by ASV just comment out the following line
taxa_table = group_by_taxonomy(taxa_table, taxonomy, 5)

### Calculate indicator values 
# change metadata column to 
indicator_multipatt = multipatt(t(taxa_table), 
                                metadata$combined_groups <- paste(metadata$Disease,
                                                                  metadata$antidepressant_use, sep="_"),duleg=T)
### Look at output
summary(indicator_multipatt)

### Write output to file (only writes significant indicators)
# Sometimes, if grouping by higher order taxa (e.g., species), the taxa name is too long
#   and the output pushes onto multiple lines. You can just zoom out your Rstudio window
#   and it should all work on one line again.
indicator_output = capture.output(summary(indicator_multipatt,indvalcomp=TRUE, alpha = 0.05))
write.table(indicator_output,file="indiciator_genus_PD_and_control.txt",row.names=F,quote=F)


# Older function in the labdsv package that is easier to filter/plot but harder make a nice table
# indicator_values = indval(t(taxa_table),metadata$body.site)