# Generating differential and relative abundance plots in R.

# Working in R

# Load CRAN packages
library(tidyverse)
library(vegan)
library(ape)

# Load Bioconductor packages
library(phyloseq)
library(DESeq2)

# Calculate relative abundance
calculate_relative_abundance <- function(x) x / sum(x)

# Calculate geometric mean
calculate_gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))}

set.seed(711)

# Export biom file and tree from QIIME2 and provide original metadata file
Q2_biom_file <- import_biom("Q2-table-with-taxonomy.biom")
metadata  <- import_qiime_sample_data("no_duplicates_metadata.tsv")
tree      <- read_tree_greengenes("tree.nwk")

# Convert from multichotomous to dichotmous tree
tree <- multi2di(tree)

# Combine all information into a single phyloseq object
Q2_physeq <- merge_phyloseq(Q2_biom_file, metadata, tree)

# Receive basic metrics of data
Q2_physeq

# Check the number of samples or taxa (ASVs)
head(sample_data(Q2_physeq))

# Explore taxonomic information contained in physeq files
head(tax_table(Q2_physeq))

# Convert taxonomic rank from numbers to proper names (assign new taxonomic column names)
colnames(tax_table(Q2_physeq)) <- c("Kingdom", "Phylum", "Class","Order", "Family",
                                    "Genus", "Species")
head(tax_table(Q2_physeq))

# Determine threshold for low number of reads (pruning)
sort(sample_sums(Q2_physeq), decreasing = T)

# Prune to 1533 reads (filter out samples with less than 1533 reads)
at_least_1533<- prune_samples(sample_sums(Q2_physeq) >= 1533, Q2_physeq)
sample_sums(at_least_1533)

# Differential abundance analysis of probiotics vs. no probiotics

## Keep features with a relative abundance higher than the threshold 0.0005
Q2_total_counts <- taxa_sums(at_least_1533)
Q2_relative_abundance <- calculate_relative_abundance(Q2_total_counts)
abundant <- Q2_relative_abundance > 0.0005
Q2_abundant_taxa <- prune_taxa(abundant, at_least_1533)
Q2_abundant_taxa

## Set taxonomic level for analysis
Q2_abundant_genera <- tax_glom(Q2_abundant_taxa, taxrank = "Genus")
Q2_abundant_genera

## Compare “yes” to “no” probiotics
sample_data(Q2_abundant_genera)$probiotic <-
  factor(sample_data(Q2_abundant_genera)$probiotic,
         levels = c("no", "yes"))

## Create a DESeq2 object with phyloseq_to_deseq2()
deseq_Q2 <- phyloseq_to_deseq2(Q2_abundant_genera, ~ probiotic)
geo_means <- apply(counts(deseq_Q2), 1, calculate_gm_mean)
deseq_Q2_estimate <- estimateSizeFactors(deseq_Q2, geoMeans = geo_means)
deseq_Q2_DESeq <- DESeq(deseq_Q2_estimate, fitType = "local")

Q2_diff_abund <- results(deseq_Q2_DESeq)

## Define the alpha level to determine significant changes
alpha <- 0.05
significant <- as.data.frame(Q2_diff_abund)
significant_filter <- filter(significant, padj < alpha)

## Merge table with significant results with the table of taxonomic information
# and rearrange it to put the largest differences at the top
genera_df <- as.data.frame(tax_table(Q2_abundant_genera))
significant_merge <- merge(significant_filter, genera_df, by = "row.names")
significant_arrange <- arrange(significant_merge, log2FoldChange)

dim(significant_arrange)

significant_arrange

# Create a differential abundance analysis plot
significant_mutate <- mutate(significant_arrange,
                             Genus = factor(Genus, levels = Genus))

ggplot(significant_mutate, aes(x = log2FoldChange, y = Genus)) +
  geom_bar(stat = "identity") +
  labs(title = "Differential abundant genera",
       x = expression(log[2]~fold~change),
       y = "Genus") +
  theme_bw()

## Creating relative abundance plots
# Calculate relative abundance
Q2_physeq_RA <- transform_sample_counts(at_least_1533, calculate_relative_abundance)

# Remove low-abundant features
Q2_total_counts <- taxa_sums(at_least_1533)
Q2_relative_abundance <- calculate_relative_abundance(Q2_total_counts)

# Determine what sequences have a higher relative abundance than a threshold
# (0.005 or 0.1%, i.e. a ratio of 0.0005 or 0.001)
abundant <- Q2_relative_abundance > 0.0005
Q2_abundant_RA_taxa <- prune_taxa(abundant, Q2_physeq_RA)
Q2_abundant_RA_taxa

# tax_glom
Q2_abundant_RA_genera <- tax_glom(Q2_abundant_RA_taxa, taxrank = "Genus")

# Clostridium_sensu_stricto_1
Clostridium_sensu_stricto_1 <- subset_taxa(Q2_abundant_RA_genera, Genus == "g__Clostridium_sensu_stricto_1")
otu_table(Clostridium_sensu_stricto_1)

Clostridium_sensu_stricto_1_long <- psmelt(Clostridium_sensu_stricto_1)

Clostridium_sensu_stricto_1_plot <- ggplot(Clostridium_sensu_stricto_1_long, aes (x = probiotic, y = Abundance)) +
  geom_boxplot() +
  labs(title = "Relative abundance of Clostridium_sensu_stricto_1",
       x = "Probiotics", 
       y = "Relative abundance") + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"))
Clostridium_sensu_stricto_1_plot

Clostridium_sensu_stricto_1_long %>% group_by(probiotic) %>% summarize(above_zero = sum(Abundance > 0))

# Collinsella
Collinsella <- subset_taxa(Q2_abundant_RA_genera, Genus == "g__Collinsella")
otu_table(Collinsella)

Collinsella_long <- psmelt(Collinsella)
Collinsella_long

Collinsella_plot <- ggplot(Collinsella_long, aes (x = probiotic, y = Abundance)) +
  geom_boxplot() +
  labs(title = "Relative abundance of Collinsella",
       x = "Probiotics",
       y = "Relative abundance") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
      panel.background = element_blank(), axis.line = element_line(colour = "black"))
Collinsella_plot

Collinsella_long %>% group_by(probiotic) %>% summarize(above_zero = sum(Abundance > 0))

# Acinetobacter
Acinetobacter <- subset_taxa(Q2_abundant_RA_genera, Genus == "g__Acinetobacter")
otu_table(Acinetobacter)

Acinetobacter_long <- psmelt(Acinetobacter)
Acinetobacter_long

Acinetobacter_plot <- ggplot(Acinetobacter_long, aes(x = probiotic, y = Abundance)) +
  geom_boxplot() +
  labs(title = "Relative abundance of Acinetobacter",
       x     = "Probiotics",
       y     = "Relative abundance") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"))
Acinetobacter_plot

Acinetobacter_long %>% group_by(probiotic) %>% summarize(above_zero = sum(Abundance > 0))

# Erysipelatoclostridium
Erysipelatoclostridium <- subset_taxa(Q2_abundant_RA_genera, Genus == "g__Erysipelatoclostridium")
otu_table(Erysipelatoclostridium)

Erysipelatoclostridium_long <- psmelt(Erysipelatoclostridium)
Erysipelatoclostridium_long

Erysipelatoclostridium_long %>% group_by(probiotic) %>% summarize(above_zero = sum(Abundance > 0))

Erysipelatoclostridium_plot <- ggplot(Erysipelatoclostridium_long, aes(x = probiotic, y = Abundance)) +
  geom_boxplot() +
  labs(title = "Relative abundance of Erysipelatoclostridium",
       x     = "Probiotics",
       y     = "Relative abundance") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"))
Erysipelatoclostridium_plot

Erysipelatoclostridium_long %>% group_by(probiotic) %>% summarize(above_zero = sum(Abundance > 0))

# Alpha and Beta analyses in QIIME2.

# PCoA plots in R 

# Combine all information into a single phyloseq object
Q2_physeq <- merge_phyloseq(Q2_biom_file, metadata, tree)

# Rarefy to normalize samples
physeq_rar <- rarefy_even_depth(Q2_physeq, sample.size = 21146)

# Set method to PCoA and set distance to the metric of interest 

# Weighted UniFrac
ord_weighted <- ordinate(physeq_rar, method = "PCoA", distance = "wunifrac")

# Unweighted UniFrac
ord_unweighted <- ordinate(physeq_rar, method = "PCoA", distance = "uunifrac")

# Plot weighted UniFrac
plot_ordination(physeq_rar,
                ord_weighted,
                color = "probiotic") +
  # Define title of plot
  labs(title = "PCoA (weighted UniFrac)") +
  theme_bw() +
  # Position an ellipses around each group of data, `type` determines how centre
  # is determined, `size` the thickness of the line
  stat_ellipse(type = "norm", size = 0.5) +
  # Manually adjust colours and labels 
  scale_colour_manual(values = c(“red”, “blue”),
                      labels = c("no", "yes")) +
  # Themes change the basic look of your plot
  # Rename the title of your legend
  guides(color = guide_legend("Probiotics"))

# Plot unweighted UniFrac
plot_ordination(physeq_rar,
                ord_unweighted,
                color = "probiotic") +
  # Define title of plot
  labs(title = "PCoA (unweighted UniFrac)") +
  theme_bw() +
  # Position an ellipses around each group of data, `type` determines how centre
  # is determined, `size` the thickness of the line
  stat_ellipse(type = "norm", size = 0.5) +
  # Manually adjust colours and labels
  scale_colour_manual(values = c("red", "blue"),
                      labels = c("no", "yes")) +
  # Themes change the basic look of your plot
  # Rename the title of your legend
  guides(color = guide_legend("Probiotics"))
