<QIIME2_Workflow.txt>
### QIIME2 Workflow
# Team 4 - Seohee An, Priya Gill,  Brandon Park, Chelsea Williams

# Import data
qiime tools import \
--type SampleData[SequencesWithQuality] \
--input-path /mnt/datasets/project_2/soil/soil_manifest.txt \
--output-path demux.qza \
--input-format SingleEndFastqManifestPhred33V2

# Demultiplex
qiime demux summarize \
--i-data demux.qza \
--o-visualization demux.qzv

# Denoise
qiime dada2 denoise-single \
--i-demultiplexed-seqs demux.qza \
--p-trim-left 0 \
--p-trunc-len 150 \
--o-representative-sequences rep-seqs-dada2.qza \
--o-table table-dada2.qza \
--o-denoising-stats stats-dada2.qza
 
qiime metadata tabulate \
--m-input-file stats-dada2.qza \
--o-visualization stats-dada2.qzv
mv rep-seqs-dada2.qza rep-seqs.qza
mv table-dada2.qza table.qza

# Filter 'NA' samples from metadata variables
qiime feature-table filter-samples \
--i-table table.qza \ 
--m-metadata-file /mnt/datasets/project_2/soil/soil_metadata.txt \
--p-where ‚Äú[Moisture Content]!=‚ÄôNA‚Äô 
AND [Total Carbon]!=‚ÄôNA‚Äô
AND [Total Nitrogen]!=‚ÄôNA‚Äô
AND [pH]!=‚ÄôNA‚Äô‚Äù
--o-filtered-table table_no_NA.qza

# Generate feature table
qiime feature-table summarize \
--i-table table_no_NA.qza \
--o-visualization table_no_NA.qzv \
--m-sample-metadata-file /mnt/datasets/project_2/soil/soil_metadata.txt
 
qiime feature-table tabulate-seqs \
--i-data rep-seqs.qza \
--o-visualization rep-seqs.qzv

# Create phylogenetic tree
qiime phylogeny align-to-tree-mafft-fasttree \
--i-sequences rep-seqs.qza \
--o-alignment aligned-rep-seqs.qza \
--o-masked-alignment masked-aligned-rep-seqs.qza \
--o-tree unrooted-tree.qza \
--o-rooted-tree rooted-tree.qza

# Core metrics
qiime diversity core-metrics-phylogenetic \
--i-phylogeny rooted-tree.qza \
--i-table table_no_NA.qza \
--p-sampling-depth 4800 \
--m-metadata-file /mnt/datasets/project_2/soil/soil_metadata.txt \
--output-dir core-metrics-results

# Alpha diversity analysis using the Shannon diversity index
qiime diversity alpha-group-significance \
--i-alpha-diversity core-metrics-results/shannon_vector.qza \
--m-metadata-file /mnt/datasets/project_2/soil/soil_metadata.txt  \
--o-visualization core-metrics-results/shannon-group-significance.qzv

# Alpha rarefaction
qiime diversity alpha-rarefaction \
--i-table table_no_NA.qza \
--i-phylogeny rooted-tree.qza \
--p-max-depth 10400 \
--m-metadata-file /mnt/datasets/project_2/soil/soil_metadata.txt \
--o-visualization alpha-rarefaction.qzv

# Download Greengenes 99% database 
wget "ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz"

gunzip gg_13_8_otus.tar.gz

tar -xvf gg_13_8_otus.tar

qiime tools import \
--type 'FeatureData[Sequence]' \
--input-path gg_13_8_otus/rep_set/99_otus.fasta \
--output-path ref-otus.qza 

qiime tools import \
--type 'FeatureData[Taxonomy]' \
--input-format HeaderlessTSVTaxonomyFormat \
--input-path gg_13_8_otus/taxonomy/99_otu_taxonomy.txt \
--output-path ref-taxonomy.qza

# Extract study-specific reference reads
qiime feature-classifier extract-reads \
--i-sequences ref-otus.qza \
--p-f-primer AGAGTTTGATCMTGGCTCAG \
--p-r-primer GWATTACCGCGGCKGCTG \
--p-trunc-len 150 \
--o-reads ref-seqs.qza

# Train classifier using reference reads
qiime feature-classifier fit-classifier-naive-bayes \
--i-reference-reads ref-seqs.qza \
--i-reference-taxonomy ref-taxonomy.qza \
--o-classifier classifier.qza

# Use trained classifier to the study sequences (taxonomy analysis)
qiime feature-classifier classify-sklearn \
--i-classifier classifier.qza \
--i-reads rep-seqs.qza \
--o-classification taxonomy.qza

qiime metadata tabulate \
--m-input-file taxonomy.qza \
--o-visualization taxonomy.qzv

qiime taxa barplot \
--i-table table_no_NA.qza \
--i-taxonomy taxonomy.qza \
--m-metadata-file /mnt/datasets/project_2/soil/soil_metadata.txt \
--o-visualization taxa-bar-plots.qzv

# ASV-based filtering (removing Archaea, chloroplasts, mitochondria, and ultra-rare ASVs), generating a new table
qiime feature-table filter-features \
--i-table table_no_NA.qza \
--p-min-frequency 500 \
--o-filtered-table feature-frequency-filtered-table.qza

qiime taxa filter-table \
--i-table table_no_NA.qza \
--i-taxonomy taxonomy.qza \
--p-exclude mitochondria \
--o-filtered-table table-no-mitochondria.qza

# Export data from QIIME2
qiime tools export \
--input-path table_no_NA.qza \ --output-path exported

qiime tools export \
--input-path taxonomy.qza \ --output-path exported

qiime tools export \
--input-path rooted-tree.qza \ --output-path exported

# Open file in a text editor on server
nano exported/taxonomy.tsv
# Edit the column names and change
`Feature ID` to `#OTUID`
`Taxon` to `taxonomy`
`Confidence` to `confidence`

biom add-metadata \ -i exported/feature-table.biom \ -o exported/table-with-taxonomy.biom \ --observation-metadata-fp exported/taxonomy.tsv \ --sc-separated taxonomy


<Correlation_Analysis_R.r>
Correlation_Analysis_R
### Correlation Analysis 
# Team 4 - Seohee An, Priya Gill, Brandon Park, Chelsea Williams

# Loading packages
library(tidyverse)
library(vegan)
library(phyloseq)
library(DESeq2)
library(ggplot2)

# importing all data files
biom_file <- import_biom("table-with-taxonomy.biom")
metadata <- import_qiime_sample_data("soil_metadata.txt")
tree <- read_tree_greengenes("tree-soil.nwk")

### Alpha diversity functions
# Shannon's diversity
shannons = function(x){
  present = x[x>0]
  p = present/sum(present)
  -sum(p*log(p))
}

# Pielou's evennenss function
evenness = function(x){
  present = x[x>0]
  p = present/sum(present)
  -sum(p*log(p))/log(sum(x>0))
}

# Richness function
richness = function(x){
  return(sum(x>0))
}

### Load data
biom = import_biom("table-with-taxonomy.biom")
taxa_table = otu_table(biom_file)
taxonomy = tax_table(biom_file)
metadata = read.table("soil_metadata.txt",sep="\t",header=T,row.names = 1)

# Select only samples with metadata
microbial_samples = colnames(taxa_table)
metadata_samples = rownames(metadata)
which_metadata = c()
for (i in 1:dim(taxa_table)[2]){
  which_metadata = c(which_metadata,which(metadata_samples == microbial_samples[i]))
}
metadata = metadata[which_metadata,]

### Calculate alpha diversity
metadata$richness = apply(taxa_table,2,richness)
metadata$shannons = apply(taxa_table,2,shannons)
metadata$evenness = apply(taxa_table,2,evenness)

### Correlation plots of Shannon's vs metadata variables
# pH
ggplot(metadata,aes(x=pH,y=shannons)) +
  geom_point() +
  labs(x = "pH", y = "Shannon's Diversity") +
  geom_smooth(method="lm",formula=y~x) +
  theme_bw(base_size = 15)
# Correlation
cor.test(metadata$pH,metadata$shannons)
# Regression
ph_shannons_lm = lm(shannons ~ pH, data = metadata)
summary(ph_shannons_lm)

# Total Nitrogen
ggplot(metadata,aes(x=Total.Nitrogen,y=shannons)) +
  geom_point() +
  labs(x = "Total Nitrogen", y = "Shannon's Diversity") +
  geom_smooth(method="lm",formula=y~x) +
  theme_bw(base_size = 15)
# Correlation
cor.test(metadata$Total.Nitrogen,metadata$shannons)
# Regression
nitrogen_shannons_lm = lm(shannons ~ Total.Nitrogen, data = metadata)
summary(nitrogen_shannons_lm)

# Total Carbon
ggplot(metadata,aes(x=Total.Carbon,y=shannons)) +
  geom_point() +
  labs(x = "Total Carbon", y = "Shannon's Diversity") +
  geom_smooth(method="lm",formula=y~x) +
  theme_bw(base_size = 15)
# Correlation
cor.test(metadata$Total.Carbon,metadata$shannons)
# Regression
carbon_shannons_lm = lm(shannons ~ Total.Carbon, data = metadata)
summary(carbon_shannons_lm)

# Moisture Content
ggplot(metadata,aes(x=Moisture.Content,y=shannons)) +
  geom_point() +
  labs(x = "Moisture Content", y = "Shannon's Diversity") +
  geom_smooth(method="lm",formula=y~x) +
  theme_bw(base_size = 15)
# Correlation
cor.test(metadata$Moisture.Content,metadata$shannons)
# Regression
moisture_shannons_lm = lm(shannons ~ Moisture.Content, data = metadata)
summary(moisture_shannons_lm)


### Correlation plots of Shannon's Diversity vs metadata variables by LTSP
# pH
ggplot(metadata,aes(x=pH,y=shannons)) +
  facet_grid(~LTSP.Treatment) +
  geom_point() +
  labs(x = "pH", y = "Shannon's Diversity") +
  geom_smooth(method="lm",formula=y~x) +
  theme_bw(base_size = 15)

ggplot(metadata,aes(x=pH,y=shannons,group=LTSP.Treatment,color=LTSP.Treatment)) +
  geom_point() +
  labs(x = "pH", y = "Shannon's Diversity") +
  geom_smooth(method="lm",formula=y~x) +
  theme_bw(base_size = 15)

for (g in unique(metadata$LTSP.Treatment)){
  print(paste("LTSP Treatment =",g))
  temp_metadata = filter(metadata,LTSP.Treatment==g)
  # Correlation
  temp_cor = cor.test(temp_metadata$pH,temp_metadata$shannons)
  print(paste("Correlation =",temp_cor$estimate,
              "p value =",temp_cor$p.value))
  # Regression
  om_ph_shannons_lm = lm(shannons ~ pH, data = temp_metadata)
  print(paste("Regression slope =",summary(om_ph_shannons_lm)$coefficients[2,1],
              "p value =",summary(om_ph_shannons_lm)$coefficients[2,4]))
}

# Total Nitrogen
ggplot(metadata,aes(x=Total.Nitrogen,y=shannons)) +
  facet_grid(~LTSP.Treatment) +
  geom_point() +
  labs(x = "Total Nitrogen", y = "Shannon's Diversity") +
  geom_smooth(method="lm",formula=y~x) +
  theme_bw(base_size = 15)

ggplot(metadata,aes(x=Total.Nitrogen,y=shannons,group=LTSP.Treatment,color=LTSP.Treatment)) +
  geom_point() +
  labs(x = "Total Nitrogen", y = "Shannon's Diversity") +
  geom_smooth(method="lm",formula=y~x) +
  theme_bw(base_size = 15)

for (g in unique(metadata$LTSP.Treatment)){
  print(paste("LTSP Treatment =",g))
  temp_metadata = filter(metadata,LTSP.Treatment==g)
  # Correlation
  temp_cor = cor.test(temp_metadata$Total.Nitrogen,temp_metadata$shannons)
  print(paste("Correlation =",temp_cor$estimate,
              "p value =",temp_cor$p.value))
  # Regression
  om_nitrogen_shannons_lm = lm(shannons ~ Total.Nitrogen, data = temp_metadata)
  print(paste("Regression slope =",summary(om_nitrogen_shannons_lm)$coefficients[2,1],
              "p value =",summary(om_nitrogen_shannons_lm)$coefficients[2,4]))
}

# Total Carbon
ggplot(metadata,aes(x=Total.Carbon,y=shannons)) +
  facet_grid(~LTSP.Treatment) +
  geom_point() +
  labs(x = "Total Carbon", y = "Shannon's Diversity") +
  geom_smooth(method="lm",formula=y~x) +
  theme_bw(base_size = 15)

ggplot(metadata,aes(x=Total.Carbon,y=shannons,group=LTSP.Treatment,color=LTSP.Treatment)) +
  geom_point() +
  labs(x = "Total Carbon", y = "Shannon's Diversity") +
  geom_smooth(method="lm",formula=y~x) +
  theme_bw(base_size = 15)

for (g in unique(metadata$LTSP.Treatment)){
  print(paste("LTSP Treatment =",g))
  temp_metadata = filter(metadata,LTSP.Treatment==g)
  # Correlation
  temp_cor = cor.test(temp_metadata$Total.Carbon,temp_metadata$shannons)
  print(paste("Correlation =",temp_cor$estimate,
              "p value =",temp_cor$p.value))
  # Regression
  om_carbon_shannons_lm = lm(shannons ~ Total.Carbon, data = temp_metadata)
  print(paste("Regression slope =",summary(om_carbon_shannons_lm)$coefficients[2,1],
              "p value =",summary(om_carbon_shannons_lm)$coefficients[2,4]))
}

# Moisture Content
ggplot(metadata,aes(x=Moisture.Content,y=shannons)) +
  facet_grid(~LTSP.Treatment) +
  geom_point() +
  labs(x = "Moisture Content", y = "Shannon's Diversity") +
  geom_smooth(method="lm",formula=y~x) +
  theme_bw(base_size = 15)

ggplot(metadata,aes(x=Moisture.Content,y=shannons,group=LTSP.Treatment,color=LTSP.Treatment)) +
  geom_point() +
  labs(x = "Moisture Content", y = "Shannon's Diversity") +
  geom_smooth(method="lm",formula=y~x) +
  theme_bw(base_size = 15)

for (g in unique(metadata$LTSP.Treatment)){
  print(paste("LTSP Treatment =",g))
  temp_metadata = filter(metadata,LTSP.Treatment==g)
  # Correlation
  temp_cor = cor.test(temp_metadata$Moisture.Content,temp_metadata$shannons)
  print(paste("Correlation =",temp_cor$estimate,
              "p value =",temp_cor$p.value))
  # Regression
  om_moisture_shannons_lm = lm(shannons ~ Moisture.Content, data = temp_metadata)
  print(paste("Regression slope =",summary(om_moisture_shannons_lm)$coefficients[2,1],
              "p value =",summary(om_moisture_shannons_lm)$coefficients[2,4]))
}


<Differential_Abundance_Analysis_R.r> 
### Differential Abundance Analysis
# Team 4 - Seohee An, Priya Gill,  Brandon Park, Chelsea Williams

# Load CRAN packages
library(tidyverse)
library(vegan)

# Load Bioconductor packages
library(phyloseq)
library(DESeq2)

# Calculate relative abundance
calculate_RA <- function(x) x/sum(x)

# Helper functions
gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# Export biom file and tree from QIIME2 and provide original metadata file
biom_file <- import_biom("table-with-taxonomy.biom")
metadata  <- import_qiime_sample_data("soil_metadata.txt")
tree      <- read_tree_greengenes("tree-soil.nwk")

# Combine all information into a single phyloseq object
physeq    <-  merge_phyloseq(biom_file, metadata, tree)

# Convert taxonomic rank from numbers to proper names
colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class", "Order",
                                 "Family", "Genus", "Species")

# Keep only abundant ASVs
# First determine counts of ASV across all samples
total_counts <- taxa_sums(physeq)

# Calculate relative abundance for each ASV
relative_abundance <- calculate_RA(total_counts)

# Determine which ASVs are more abundant than 0.1%
abundant <- relative_abundance > 0.001 
abundant_taxa <- prune_taxa(abundant, physeq)
# Check the resulting new phyloseq object with much fewer taxa
abundant_taxa

# Phyloseq object is called "Class"
class <- tax_glom(abundant_taxa, taxrank = "Class", NArm = FALSE)
class

# Only keep OM1 samples
OM1class <- subset_samples(class, LTSP.Treatment == "OM1")
OM1class

# Limit to OM1 samples that were filtered 
deseq_feature <- phyloseq_to_deseq2(OM1class, ~ LTSP.Treatment)
geo_means <- apply(counts(deseq_feature), 1, gm_mean)
deseq_feature <- estimateSizeFactors(deseq_feature, geoMeans = geo_means)
deseq_feature <- DESeq(deseq_feature, fitType="local")

feature_diff_abund <- results(deseq_feature)

# Define cutoff for the adjusted p-value
alpha <- 0.05
# Reformat information as data frame including feature as variable
significant_feature <- as_tibble(feature_diff_abund, rownames = "feature")
# Keep only significant results and sort by adjusted p-value
significant_feature <- filter(significant_feature, padj < alpha)
significant_feature <- arrange(significant_feature, padj)

# Get the taxonomic information as a data frame
taxa_df <- as_tibble(as.data.frame(tax_table(physeq)), rownames = "feature")

# Combine the significant features with taxonomic classification
significant_feature <- inner_join(significant_feature, taxa_df)

# There 14 features that are different at FDR-corrected p < 0.05!
dim(significant_feature)

# Plot differential abundance
significant_feature %>%
  ggplot(aes(x = log2FoldChange, y = Class)) +
  geom_col() +
  xlab("Log2 Fold Change") +
  # Label for y-axis
  ylab("Class") +
  theme_bw(base_size = 16)


# Convert to relative abundance and define taxrank as Class
physeq_RA <- transform_sample_counts(physeq, calculate_RA)
species_RA <- tax_glom(physeq_RA, taxrank = "Class", NArm = FALSE)

# Graph actinobacteria and filter for that class with subset_taxa 
actinobacteria <- subset_taxa(species_RA, Class == "c__Actinobacteria")

# Plot data by changing the table from wide to long format
actinobacteria_melt <- psmelt(actinobacteria)

# Plot with ggplot function
# Define what dataset to plot and what variable on x-axis and y-axis
ggplot(actinobacteria_melt, aes(x = as.factor(LTSP.Treatment), y = Abundance)) + 
  # Visualize data as Boxplot
  geom_boxplot(aes(fill = as.factor(LTSP.Treatment))) + 
  # Label for x-axis
  xlab("LTSP Treatment") +
  # Label for y-axis
  ylab("Relative Abundance") +
  # Plot title
  ggtitle("The Relative Abundance of Actinobacteria") + 
  # colors for the different groups 
  scale_fill_manual(values=c("gray42", "gray42", "gray42", "gray42")) +
  guides(fill = FALSE) +
  # Set a standard theme for plot
  theme_bw(base_size = 16)


<Beta_Diversity_Analysis_R.r>
### Beta Diversity Analysis
# Team 4 - Seohee An, Priya Gill, Brandon Park, Chelsea Williams

# Loading packages
library(tidyverse)
library(vegan)
library(phyloseq)
library(DESeq2)

# Helper functions
gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# Importing all data files
biom_file <- import_biom("table-with-taxonomy.biom")
metadata <- import_qiime_sample_data("soil_metadata.txt")
tree <- read_tree_greengenes("tree-soil.nwk")

# Combine all objects into phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)
# Overview of phyloseq object
physeq

# Use sample_data() to look at metadata of phyloseq object
# Look only at first 6 lines of table with head()
head(sample_data(physeq))

# Set set of random numbers
set.seed(600)

# taxonomic rank names
rank_names(physeq)
# Rename column names for taxonomic ranks
colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class",
                                 "Order", "Family", "Genus",
                                 "Species")
rank_names(physeq)

# Filter data based on metadata category with ==
OM1 <- subset_samples(physeq, LTSP.Treatment == "OM1")
OM1


# Filter out data based on metadata category with !=
no_OM1 <- subset_samples(physeq, LTSP.Treatment != "OM1")
no_OM1

# format data as table with as.data.frame()
as.data.frame(sample_names(OM1))
# Show read count for each sample using sample_sums()
as.data.frame(sample_sums(OM1))

# Exclude samples with less than 7000 reads
OM1_7000 <- prune_samples(sample_sums(OM1) >= 7000, OM1)
OM1_7000

# Beta diversity PCoA plot
# Diversity requires rarefied taxa tables
physeq_rar <- rarefy_even_depth(physeq, sample.size = 4800)

# Convert to RA (relative abundance)
physeq_rar_RA <- transform_sample_counts(physeq_rar, function(x) x/sum(x))

#  Define ordinate() and set method = PCoA and use weighted unifrac metric
ord <- ordinate(physeq_rar_RA, method = "PCoA",
                distance = "wunifrac")

# Plot data
plot_ordination(physeq_rar_RA,
                ord,
                type = "sample",
                color = "LTSP.Treatment",
                title = "PCoA (Weighted Unifrac)") +
  # Adding text to plot
  annotate(geom = "text",
           label = ".",
           x = - 0.025,
           y = 0.025,
           size = 4) + 
  # Manually adjust colours for points
  scale_colour_manual(values = c("red", "orange",
                                 "forestgreen", "blue"),
                      labels = c("OM1", "OM2", "OM3", "REF")) +
  stat_ellipse(type = "norm", size = 1) + 
  guides(colour = guide_legend("LTSP Treatment")) + 
  theme_bw(base_size = 14)