# Haq et al R Script

#Loading all necessary packages
library(tidyverse)
library(vegan)
library(ape)
library(phyloseq)
library(DESeq2)

#Differential Abundance for Infant Use of Probiotics
#Provide custom functions
calculate_relative_abundance <- function(x) x / sum(x)

calculate_gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x [x > 0]), na.mr = na.rm) / length (x))
}

set.seed(711)

biom_file <- import_biom("table-with-taxonomy.biom")
metadata  <- import_qiime_sample_data("infant_metadata.tsv")
tree      <- read_tree_greengenes("tree.nwk")

# Convert from multichotomous to dichotmous tree
tree <- multi2di(tree)


# Combine all information into a single phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)

#Look at first 6 rows of metadata
head(sample_data(physeq))

#Assign new taxonomic rank column names
colnames(tax_table(physeq)) <- c("Kingdom","Phylum", "Class", 
                                 "Order", "Family", "Genus",
                                 "Species")

head(tax_table(physeq))

#Filtering by depth

sample_sums(physeq)
sample_sums(physeq) >= 1000

at_least_1000 <- prune_samples(sample_sums(physeq) >= 1000, physeq)
sample_sums(at_least_1000)


#Differential abundance analysis
#Filtering based on metadata
infant <- subset_samples(at_least_1000, life_stage == "Infant")
infant <- subset_samples(infant, probiotic_inf != "not collected")



#Removing low-abundant features (step 5)
infant_counts <- taxa_sums(infant)

relative_abundance_infant <- calculate_relative_abundance(infant_counts)
abundant_infant <- relative_abundance_infant > 0.0005
abundant_infant_taxa <- prune_taxa(abundant_infant, infant)

#Set taxanomic level (step 6) 
abundant_infant_genera <- tax_glom(abundant_infant_taxa, taxrank = "Genus")

sample_data(abundant_infant_genera)$probiotic_inf <- 
  factor (sample_data(abundant_infant_genera)$probiotic_inf, 
          levels = c("yes", "no"))

#Discussion help
infant <- prune_samples(sample_sums(infant) > 0, infant)


#DESeq2 analysis 
deseq_infant <- phyloseq_to_deseq2(abundant_infant_genera, 
                                   ~ probiotic_inf)

geo_means <- apply(counts(deseq_infant), 1, calculate_gm_mean)

#Make sure to filter out low quality
deseq_infant <- estimateSizeFactors(deseq_infant, 
                                    geoMeans = geo_means)



deseq_infant <- DESeq(deseq_infant, fitType = "local")

infant_diff_abund <- results(deseq_infant)

#Determine significant results
alpha <- 0.05
significant_infant <- as.data.frame(infant_diff_abund)
significant_infant <- filter(significant_infant, padj < alpha)

#Add taxonomic information
genera_df <- as.data.frame(tax_table(abundant_infant_genera))
significant_infant <- merge(significant_infant, genera_df, 
                            by = "row.names")
significant_infant <- arrange(significant_infant, log2FoldChange)
significant_infant <- mutate(significant_infant, 
                             Genus = factor(Genus, levels = Genus))

#Plot
ggplot(significant_infant, aes(x = log2FoldChange, 
                               y = Genus)) + 
  geom_bar(stat = "identity") + 
  labs(title = "Differential abundant genera in infant probiotic use", 
       x = "log2 fold-change",
       y = "Genus") + 
  theme_bw()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  geom_bar(stat="identity", fill="#FF6565", colour="black")




#Differential Abundance for Mother Use of Probiotics 
#Provide custom functions
calculate_relative_abundance <- function(x) x / sum(x)

calculate_gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x [x > 0]), na.mr = na.rm) / length (x))
}

set.seed(711)

biom_file <- import_biom("table-with-taxonomy.biom")
metadata  <- import_qiime_sample_data("infant_metadata.tsv")
tree      <- read_tree_greengenes("tree.nwk")

# Convert from multichotomous to dichotmous tree
tree <- multi2di(tree)


# Combine all information into a single phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)

#Look at first 6 rows of metadata
head(sample_data(physeq))

#Assign new taxonomic rank column names
colnames(tax_table(physeq)) <- c("Kingdom","Phylum", "Class", 
                                 "Order", "Family", "Genus",
                                 "Species")

head(tax_table(physeq))

#Filtering by depth

sample_sums(physeq)
sample_sums(physeq) >= 1000

at_least_1000 <- prune_samples(sample_sums(physeq) >= 1000, physeq)
sample_sums(at_least_1000)


#Differential abundance analysis
#Filtering based on metadata
infant <- subset_samples(at_least_1000, life_stage == "Infant")
infant <- subset_samples(infant, probiotic_mom != "not collected")



#Removing low-abundant features (step 5)
infant_counts <- taxa_sums(infant)

relative_abundance_infant <- calculate_relative_abundance(infant_counts)
abundant_infant <- relative_abundance_infant > 0.0005
abundant_infant_taxa <- prune_taxa(abundant_infant, infant)

#Set taxanomic level (step 6) 
abundant_infant_genera <- tax_glom(abundant_infant_taxa, taxrank = "Genus")

sample_data(abundant_infant_genera)$probiotic_mom <- 
  factor (sample_data(abundant_infant_genera)$probiotic_mom, 
          levels = c("yes", "no"))

#Discussion help
infant <- prune_samples(sample_sums(infant) > 0, infant)


#DESeq2 analysis 
deseq_infant <- phyloseq_to_deseq2(abundant_infant_genera, 
                                   ~ probiotic_mom)

geo_means <- apply(counts(deseq_infant), 1, calculate_gm_mean)

#Make sure to filter out low quality
deseq_infant <- estimateSizeFactors(deseq_infant, 
                                    geoMeans = geo_means)



deseq_infant <- DESeq(deseq_infant, fitType = "local")

infant_diff_abund <- results(deseq_infant)

#Determine significant results
alpha <- 0.05
significant_infant <- as.data.frame(infant_diff_abund)
significant_infant <- filter(significant_infant, padj < alpha)

#Add taxonomic information
genera_df <- as.data.frame(tax_table(abundant_infant_genera))
significant_infant <- merge(significant_infant, genera_df, 
                            by = "row.names")
significant_infant <- arrange(significant_infant, log2FoldChange)
significant_infant <- mutate(significant_infant, 
                             Genus = factor(Genus, levels = Genus))

#Plot
ggplot(significant_infant, aes(x = log2FoldChange, 
                               y = Genus)) + 
  geom_bar(stat = "identity") + 
  labs(title = "Differential abundant genera in mother probiotic use", 
       x = "log2 fold-change",
       y = "Genus") + 
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  geom_bar(stat="identity", fill="#FF6565", colour="black")



#Differential Abundance for the 1-14-27 cohort
#Provide custom functions
calculate_relative_abundance <- function(x) x / sum(x)

calculate_gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x [x > 0]), na.mr = na.rm) / length (x))
}

set.seed(711)

biom_file <- import_biom("table-with-taxonomy3.biom")
metadata  <- import_qiime_sample_data("infant_metadata.tsv")
tree      <- read_tree_greengenes("tree.nwk")

# Convert from multichotomous to dichotmous tree
tree <- multi2di(tree)


# Combine all information into a single phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)

#Look at first 6 rows of metadata
head(sample_data(physeq))

#Assign new taxonomic rank column names
colnames(tax_table(physeq)) <- c("Kingdom","Phylum", "Class", 
                                 "Order", "Family", "Genus",
                                 "Species")

head(tax_table(physeq))

#Filtering by depth

sample_sums(physeq)
sample_sums(physeq) >= 1000

at_least_1000 <- prune_samples(sample_sums(physeq) >= 1000, physeq)

#Differential abundance analysis
#Filtering based on metadata
infant <- subset_samples(physeq, life_stage == "Infant")
infant <- subset_samples(infant, anonymized_name == "70001" || "70014" || "70027")
infant <- subset_samples(infant, age_category == "0.5 months" || "6 months")
infant <- subset_samples(infant, age_category != "4 months")
infant <- subset_samples(infant, age_category != "9 months")
infant <- subset_samples(infant, age_category != "2 months")



#Removing low-abundant features (step 5)
infant_counts <- taxa_sums(infant)

relative_abundance_infant <- calculate_relative_abundance(infant_counts)
abundant_infant <- relative_abundance_infant > 0.0005
abundant_infant_taxa <- prune_taxa(abundant_infant, infant)

#Set taxanomic level (step 6) 
abundant_infant_genera <- tax_glom(abundant_infant_taxa, taxrank = "Genus")

sample_data(abundant_infant_genera)$age_category <- 
  factor (sample_data(abundant_infant_genera)$age_category, 
          levels = c("0.5 months", "6 months"))

#DESeq2 analysis NOT WORKING
deseq_infant <- phyloseq_to_deseq2(abundant_infant_genera, 
                                   ~ age_category)

geo_means <- apply(counts(deseq_infant), 1, calculate_gm_mean)
deseq_infant <- estimateSizeFactors(deseq_infant, 
                                    geoMeans = geo_means)

deseq_infant <- DESeq(deseq_infant, fitType = "local")

infant_diff_abund <- results(deseq_infant)

#Determine significant results
alpha <- 0.05
significant_infant <- as.data.frame(infant_diff_abund)
significant_infant <- filter(significant_infant, padj < alpha)

#Add taxonomic information
genera_df <- as.data.frame(tax_table(abundant_infant_genera))
significant_infant <- merge(significant_infant, genera_df, 
                            by = "row.names")
significant_infant <- arrange(significant_infant, log2FoldChange)
significant_infant <- mutate(significant_infant, 
                             Genus = factor(Genus, levels = Genus))

ggplot(significant_infant, aes(x = log2FoldChange, 
                               y = Genus)) + 
  geom_bar(stat = "identity") + 
  labs(title = "differential abundant genera - 0.5 vs 6 month", 
       x = "log2 fold-change",
       y = "Genus") + 
  theme_bw()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  geom_bar(stat="identity", fill="#F4D7EE", colour="black")





#Relative Abundance for Infant Use of Probiotics
#Provide custom functions
calculate_relative_abundance <- function(x) x / sum(x)

calculate_gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x [x > 0]), na.mr = na.rm) / length (x))
}

set.seed(711)

biom_file <- import_biom("table-with-taxonomy.biom")
metadata  <- import_qiime_sample_data("infant_metadata.tsv")
tree      <- read_tree_greengenes("tree.nwk")

# Convert from multichotomous to dichotmous tree
tree <- multi2di(tree)


# Combine all information into a single phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)

#Assign new taxonomic rank column names
colnames(tax_table(physeq)) <- c("Kingdom","Phylum", "Class", 
                                 "Order", "Family", "Genus",
                                 "Species")

head(tax_table(physeq))

#Filtering by depth

sample_sums(physeq)
sample_sums(physeq) >= 1000

at_least_1000 <- prune_samples(sample_sums(physeq) >= 1000, physeq)
sample_sums(at_least_1000)

#Step 2 filtering by metadata
infant <- subset_samples(physeq, life_stage == "Infant")
infant <- subset_samples(infant, probiotic_inf != "not collected")

#Step 4 Calculate relative abundance
infant_RA <- transform_sample_counts(infant,
                                     calculate_relative_abundance)

#Step 5 remove low abundant taxa
infant_counts <- taxa_sums(infant)
relative_abundance_infant <- calculate_relative_abundance((infant_counts))
abundant_infant <- relative_abundance_infant > 0.0005
abundant_infant_RA_taxa <- prune_taxa(abundant_infant, 
                                      infant_RA)

#Step 6 Tax glom
abundant_infant_RA_genera <- tax_glom(abundant_infant_RA_taxa, 
                                      taxrank = "Genus")

#Choose different genera across code to see appropriate results
stenotrophomonas <- subset_taxa(abundant_infant_RA_genera, 
                                Genus == "g__Stenotrophomonas")
clostridioides <- subset_taxa(abundant_infant_RA_genera, 
                              Genus == "g__Clostridioides")
akkermansia <- subset_taxa(abundant_infant_RA_genera, 
                           Genus == "g__Akkermansia")
erysipelatoclostridium <- subset_taxa(abundant_infant_RA_genera, 
                                      Genus == "g__Erysipelatoclostridium")
haemophilus <- subset_taxa(abundant_infant_RA_genera, 
                           Genus == "g__Haemophilus")
escherichia_shigella <- subset_taxa(abundant_infant_RA_genera, 
                                    Genus == "g__Escherichia-Shigella")

otu_table(haemophilus)


#Transfrom data into data frame
as.data.frame(otu_table(haemophilus))

#Rearrange data from columns along rows
haemophilus_long <- psmelt(haemophilus)

#Plot data with ggplot, when bracketed with parenthesis
#gets plotted in viewer and saved to variable
#Aesthetics define how our data are represented
p <- ggplot(haemophilus_long, aes(x = probiotic_inf, 
                                  y = Abundance,
                                  fill = probiotic_inf)) +
  #Geom define how our data is represented as a geometric shape
  geom_boxplot() + 
  labs(title = "Relative abundance of Haemophilus", 
       x = "Infant probiotic use", 
       y = "Relative abundance")+
  theme_bw()+
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(colour = "black"))

p

#checking with smaller scales
p + lims(y = c(0, 0.0000001))





#Relative Abundance for Mother Use of Probiotics
#Provide custom functions
calculate_relative_abundance <- function(x) x / sum(x)

calculate_gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x [x > 0]), na.mr = na.rm) / length (x))
}

set.seed(711)

biom_file <- import_biom("table-with-taxonomy.biom")
metadata  <- import_qiime_sample_data("infant_metadata.tsv")
tree      <- read_tree_greengenes("tree.nwk")

# Convert from multichotomous to dichotmous tree
tree <- multi2di(tree)


# Combine all information into a single phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)

#Assign new taxonomic rank column names
colnames(tax_table(physeq)) <- c("Kingdom","Phylum", "Class", 
                                 "Order", "Family", "Genus",
                                 "Species")

head(tax_table(physeq))

#Filtering by depth

sample_sums(physeq)
sample_sums(physeq) >= 1000

at_least_1000 <- prune_samples(sample_sums(physeq) >= 1000, physeq)
sample_sums(at_least_1000)

#Step 2 filtering by metadata
infant <- subset_samples(physeq, life_stage == "Infant")
infant <- subset_samples(infant, probiotic_mom != "not collected")

#Step 4 Calculate relative abundance
infant_RA <- transform_sample_counts(infant,
                                     calculate_relative_abundance)

#Step 5 remove low abundant taxa
infant_counts <- taxa_sums(infant)
relative_abundance_infant <- calculate_relative_abundance((infant_counts))
abundant_infant <- relative_abundance_infant > 0.0005
abundant_infant_RA_taxa <- prune_taxa(abundant_infant, 
                                      infant_RA)

#Step 6 Tax glom
abundant_infant_RA_genera <- tax_glom(abundant_infant_RA_taxa, 
                                      taxrank = "Genus")

#Choose different genera across code to see appropriate results
acinetobacter <- subset_taxa(abundant_infant_RA_genera, 
                             Genus == "g__Acinetobacter")
collinsella <- subset_taxa(abundant_infant_RA_genera, 
                           Genus == "g__Collinsella")
akkermansia <- subset_taxa(abundant_infant_RA_genera, 
                           Genus == "g__Akkermansia")

otu_table(akkermansia)

#Transfrom data into data frame
as.data.frame(otu_table(akkermansia))

#Rearrange data from columns along rows
akkermansia_long <- psmelt(akkermansia)

#Plot data with ggplot, when bracketed with parenthesis
#gets plotted in viewer and saved to variable
#Aesthetics define how our data are represented
(p <- ggplot(akkermansia_long, aes(x = probiotic_mom, 
                                   y = Abundance,
                                   fill = probiotic_mom)) +
    #Geom define how our data is represented as a geometric shape
    geom_boxplot() + 
    labs(title = "Relative abundance of Akkermansia", 
         x = "Mother probiotic use", 
         y = "Relative abundance"))+
  theme_bw()+
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(colour = "black"))





#Relative Abundance for 1-14-27 cohort of similar probiotic use
calculate_gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x [x > 0]), na.mr = na.rm) / length (x))
}

set.seed(711)

biom_file <- import_biom("table-with-taxonomy3.biom")
metadata  <- import_qiime_sample_data("infant_metadata.tsv")
tree      <- read_tree_greengenes("tree.nwk")

# Convert from multichotomous to dichotmous tree
tree <- multi2di(tree)


# Combine all information into a single phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)

#Assign new taxonomic rank column names
colnames(tax_table(physeq)) <- c("Kingdom","Phylum", "Class", 
                                 "Order", "Family", "Genus",
                                 "Species")

head(tax_table(physeq))

#Filtering by depth

sample_sums(physeq)
sample_sums(physeq) >= 1000

at_least_1000 <- prune_samples(sample_sums(physeq) >= 1000, physeq)
sample_sums(at_least_1000)

#Step 2 filtering by metadata
infant <- subset_samples(physeq, life_stage == "Infant")
infant <- subset_samples(infant, probiotic_inf != "not collected")
infant <- subset_samples(infant, anonymized_name == "70001" || "70014" || "70027")
infant <- subset_samples(infant, age_category == "0.5 months" || "6 months")
infant <- subset_samples(infant, age_category != "4 months")
infant <- subset_samples(infant, age_category != "9 months")
infant <- subset_samples(infant, age_category != "2 months")


#Step 4 Calculate relative abundance
infant_RA <- transform_sample_counts(infant,
                                     calculate_relative_abundance)

#Step 5 remove low abundant taxa
infant_counts <- taxa_sums(infant)
relative_abundance_infant <- calculate_relative_abundance((infant_counts))
abundant_infant <- relative_abundance_infant > 0.0005
abundant_infant_RA_taxa <- prune_taxa(abundant_infant, 
                                      infant_RA)

#Step 6 Tax glom
abundant_infant_RA_genera <- tax_glom(abundant_infant_RA_taxa, 
                                      taxrank = "Genus")

#Choose different genera across code to see appropriate results
clostridioides <- subset_taxa(abundant_infant_RA_genera, 
                              Genus == "g__Clostridioides")
akkermansia <- subset_taxa(abundant_infant_RA_genera, 
                           Genus == "g__Akkermansia")
bacteroides <- subset_taxa(abundant_infant_RA_genera, 
                           Genus == "g__Bacteroides")
bifidobacterium <- subset_taxa(abundant_infant_RA_genera, 
                               Genus == "g__Bifidobacterium")
clostridium_sensu_stricto_1 <- subset_taxa(abundant_infant_RA_genera, 
                                           Genus == "g__Clostridium_sensu_stricto_1")

otu_table(bacteroides)


#Transfrom data into data frame
as.data.frame(otu_table(bacteroides))

#Rearrange data from columns along rows
bacteroides_long <- psmelt(bacteroides)

#Plot data with ggplot, when bracketed with parenthesis
#gets plotted in viewer and saved to variable
#Aesthetics define how our data are represented
p <- ggplot(bacteroides_long, aes(x = age_category, 
                                  y = Abundance,
                                  fill = age_category)) +
  #Geom define how our data is represented as a geometric shape
  geom_boxplot() + 
  labs(title = "Relative abundance of Bacteroides", 
       x = "Time point", 
       y = "Relative abundance")+
  theme_bw()+
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(colour = "black"))
p

#checking with smaller scales
p + lims(y = c(0, 0.001))