#ALL ANALYSES
#Start working on the server
ssh root@10.19.139.153

#Move into the data directory
cd /data

#Make directories for the research aims
mkdir Research_Aim_1
mkdir Research_Aim_2
mkdir Research_Aim_3A
mkdir Research_Aim_3B

#Import the data that has already been demultiplexed
 qiime tools import \
--type 'SampleData[SequencesWithQuality]' \
--input-path /mnt/datasets/project_2/tanzania/tanzania_manifest.txt \
--output-path demux.qza \
--input-format SingleEndFastqManifestPhred33V2

#Convert demux.qza to demuz.qzv to visualize the data and determine an appropriate sequence length using the interactive quality plot
 qiime demux summarize 
--i-data demux.qza 
--o-visualization demux.qzv

#Use stats-dada2.qzv to determine if we chose a length that includes a large portion of the reads of interest after sequences quality control using DADA2 
qiime dada2 denoise-single \
  --i-demultiplexed-seqs /data/demux.qza \
  --p-trim-left 6 \
  --p-trunc-len 150 \
  --o-representative-sequences rep-seqs-dada2_150.qza \
  --o-table table-dada2.qza \
  --o-denoising-stats stats-dada2.qza

qiime metadata tabulate \
  --m-input-file stats-dada2.qza \
  --o-visualization stats-dada2.qzv

#p-trunc-len 150 seems most appropriate for the data, and will therefore be used moving forward
#Create directory for denoised data and insert rep-seqs-dada2_150.qza and table-dada2.qza

mkdir denoise_150
mv rep-seqs-dada2_150.qza denoise_150
mv table-dada2.qza denoise_150

#Copied the metadata.txt file from the mnt directory to the data directory using
cd /mnt/datasets/project_2/tanzania/
#and
 cp tanzania_metadata.txt /data/

#Creating summaries of feature table and feature data using table.qza and rep-seqs.qza from sequences truncated
at 150 nt

#Change directory to denoise_150 where table.qza and rep-seqs.qza are located
cd /data/denoise_150

#Create a .qzv file for table.qza
qiime feature-table summarize \
  --i-table table-dada2.qza \
  --o-visualization table.qzv \
  --m-sample-metadata-file /data/tanzania_metadata.txt

#Create a visualization of rep-seqs.qza
qiime feature-table tabulate-seqs \
  --i-data rep-seqs-dada2_150.qza \
  --o-visualization rep-seqs_150.qzv

# Metadata-base filtering for longitudinal analysis
# Change working directory to Reasearch_Aim_1

# Filter for only the human fecal samples using table.qza in order to keep only the sample of interest. 
qiime feature-table filter-samples \
  --i-table /data/denoise_150/table-dada2.qza \
  --m-metadata-file /data/tanzania_metadata.txt \
  --p-where "[sample_type]='feces' AND [host_common_name]='human'" \
  --o-filtered-table human_feces-table.qza

# Create 2 data subsets for each host life stage: child and adult.
qiime feature-table filter-samples \
  --i-table human_feces-table.qza \
  --m-metadata-file /data/tanzania_metadata.txt \
  --p-where "[host_life_stage]='child'" \
  --o-filtered-table child_feces-table.qza

qiime feature-table filter-samples \
  --i-table human_feces-table.qza \
  --m-metadata-file /data/tanzania_metadata.txt \
  --p-where "[host_life_stage]='adult'" \
  --o-filtered-table adult_feces-table.qza

# Convert qza file to qzv to visualize the filtered table 
qiime feature-table summarize \
  --i-table child_feces-table.qza \
  --o-visualization child_feces-table.qzv \
  --m-sample-metadata-file /data/tanzania_metadata.txt

qiime feature-table summarize \
  --i-table adult_feces-table.qza \
  --o-visualization adult_feces-table.qzv \
  --m-sample-metadata-file /data/tanzania_metadata.txt

# filter metadata in R
data <- read_csv("tanzania_metadata.csv")
data <- data %>% filter(data$host_life_stage %in% c("child", "adult"))
data$unique_name_type_season <- gsub("\\_.*", "", data$unique_name_type_season)
data$X1 <- NULL
data$season[data$season %in% c("2014-EW", "2014-LW")] <- "Wet14"
data$season[data$season %in% c("2013-ED", "2013-LD")] <- "Dry13"
data$season[data$season %in% c("2014-ED", "2014-LD")] <- "Dry14"
data <- data %>% filter(bush_camp %in% c("HUKAMAKO", "SENGELI"))

write.table(data, file='tanzania_metadata_season_camp_noinfant.tsv', quote=FALSE, sep='\t', row.names=FALSE)


#Filer medadata for all human fecal samples in order to conduct beta diversity analysis:
qiime feature-table filter-samples \
--i-table /data/denoise_150/table-dada2.qza \
--m-metadata-file /data/tanzania_metadata.txt \
--p-where "[sample_type]='feces' AND [host_common_name]='human'" \
--o-filtered-table human-fecal-3A-table.qza

#Visualize metadata by making a qzv file:
qiime feature-table summarize \
--i-table human-fecal-3A-table.qza \
--o-visualization human-fecal-3A-table.qzv \
--m-sample-metadata-file /data/tanzania_metadata.txt

#Metadata-based filtering to retain fecal samples from 2014 ED season
#Change working directory to Research_Aim_2
cd /data/Research_Aim_2

#First filtering step: retain only those samples which were collected during 2014 ED season and which have a documented water source
#For water source, exclude 'not collected'
qiime feature-table filter-samples \
--i-table /data/denoise_150/table-dada2.qza \
--m-metadata-file /data/tanzania_metadata.txt \
--p-where "[season]='2014-ED' AND NOT [water_source]='not_collected'" \
--o-filtered-table 2014_ED_filtered-table.qza  

#Second filtering step: within 2014_ED_filtered-table.qza, exclude those samples whose water source is 'not applicable'
qiime feature-table filter-samples \
--i-table 2014_ED_filtered-table.qza \
--m-metadata-file /data/tanzania_metadata.txt \
--p-where "[season]='2014-ED' AND NOT [water_source]='not_applicable'" \
--o-filtered-table 2014_ED_water_documented_filtered-table.qza  

#Visualize 2014_ED_water_documented_filtered-table.qza file
qiime feature-table summarize \
--i-table 2014_ED_water_documented_filtered-table.qza \
--o-visualization 2014_ED_water_documented_filtered-table.qzv \
--m-sample-metadata-file /data/tanzania_metadata.txt

#Metadata-based filtering to retain all skin swab samples
#Move into the Research_Aim_3B directory to begin working with the right hand data
cd Research_Aim_3B

#Filter out all samples from the metadata that are not human right hand samples
qiime feature-table filter-samples \
--i-table /data/denoise_150/table-dada2.qza \
--m-metadata-file /data/tanzania_metadata.txt \
--p-where "[sample_type_merged] ='HUMAN_R_HAND'" \
--o-filtered-table right-hand-filtered-table.qza

#Visualize the file to confirm that only right hand samples were kept
qiime feature-table summarize \
--i-table right-hand-filtered-table.qza \
--o-visualization right-hand-filtered-table.qzv \
--m-sample-metadata-file /data/tanzania_metadata.txt

#Phylogenetic analysis
#Generate a tree for phylogenetic diversity analyses

#Store these in the common directory
cd /data/

#Create unrooted and rooted trees
qiime phylogeny align-to-tree-mafft-fasttree \
  --i-sequences /data/denoise_150/rep-seqs-dada2_150.qza \
  --o-alignment aligned-rep-seqs.qza \
  --o-masked-alignment masked-aligned-rep-seqs.qza \
  --o-tree unrooted-tree.qza \
  --o-rooted-tree rooted-tree.qza

#Alpha rarefaction for longitudinal analysis
qiime diversity alpha-rarefaction \
  --i-table human_feces-table.qza \
  --i-phylogeny /data/rooted-tree.qza \
  --p-max-depth 20000 \
  --m-metadata-file /data/tanzania_metadata.txt \
  --o-visualization human_feces-alpha-rarefaction.qzv

# create a visualization of human-faces feature table to observe the number of lost samples
qiime feature-table summarize \
  --i-table human_feces-table.qza \
  --o-visualization human_feces-table.qzv \
  --m-sample-metadata-file /data/tanzania_metadata.txt

#Alpha rarefaction for all fecal samples 
#Create alpha rarefaction plot using filtered table
qiime diversity alpha-rarefaction \
--i-table human-fecal-3A-table.qza \
--i-phylogeny /data/rooted-tree.qza \
--p-max-depth 20000 \
--m-metadata-file /data/tanzania_metadata.txt \
--o-visualization alpha-rarefaction-3A.qzv

#Alpha rarefaction for fecal samples from 2014 ED season
#change working directory to Research_Aim_2
cd Research_Aim_2/

#create alpha rarefaction visualization
qiime diversity alpha-rarefaction \
  --i-table 2014_ED_water_documented_filtered-table.qza \
  --i-phylogeny /data/rooted-tree.qza \
  --p-max-depth 20000 \
  --m-metadata-file /data/tanzania_metadata.txt \
  --o-visualization alpha-rarefaction_20000.qzv

#Alpha rarefaction for all skin swab samples
#Alpha rarefaction curves now need to be plotted so that an appropriate sequence depth can be chosen for the right hand samples
#This will make it so that any samples with an insufficient amount of reads are not included in the beta diversity analyses
qiime diversity alpha-rarefaction \
--i-table right-hand-filtered-table.qza \
--i-phylogeny /data/rooted-tree.qza \
--p-max-depth 20000 \
--m-metadata-file /data/tanzania_metadata.txt \
--o-visualization right-hand-alpha-rarefaction.qzv

#The number of samples discarded at a specific sampling depth can then be visualized using the interactive sample detail of right-hand-filtered-table.qzv

#Seasonal pattern of adult and child fecal samples from 2 bush camps
# Analysis using filtered metadata and visualize PCoA plot for seasonal pattern in R
# filter feature table according to filtered metadata
qiime feature-table filter-samples \
  --i-table /data/Research_Aim_1/human_feces-table.qza \
  --m-metadata-file /data/Research_Aim_1/tanzania_metadata_season_camp_noinfant.tsv \
  --o-filtered-table filtered-table.qza
# calculate diversity metrics again with the filtered table
qiime diversity core-metrics-phylogenetic \
  --i-phylogeny /data/rooted-tree.qza \
  --i-table filtered-table.qza \
  --p-sampling-depth 6500 \
  --m-metadata-file /data/Research_Aim_1/tanzania_metadata_season_camp_noinfant.tsv \
  --output-dir core-metrics-results_filtered

# recalculate the significance between seasons
qiime diversity beta-group-significance \
  --i-distance-matrix /data/Research_Aim_1/core-metrics-results_filtered/unweighted_unifrac_distance_matrix.qza \
  --m-metadata-file /data/Research_Aim_1/tanzania_metadata_season_camp_noinfant.tsv \
  --m-metadata-column season \
  --o-visualization core-metrics-results_filtered/unweighted-unifrac-season-significance.qzv \
  --p-pairwise

# visualizing it in R
# Loading packages
library(tidyverse)
library(vegan)
library(phyloseq)
library(DESeq2)
library(ape)
# Importing all data files
biom_file <- import_biom("exported/table-with-taxonomy.biom")
metadata <- import_qiime_sample_data("tanzania_metadata_season_camp_noinfant.tsv")
tree <- read_tree_greengenes("exported/tree.nwk")
tree <- multi2di(tree) 
# Combine all objects into phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)
# Set set of random numbers
set.seed(711)
# Rename column names for taxonomic ranks
colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class",
                                 "Order", "Family", "Genus",
                                 "Species")
# Filter data based on metadata category with ==
feces <- subset_samples(physeq, sample_type == "feces" & host_common_name == "human")
# Exclude samples with less than 6500 reads
feces_6500 <- prune_samples(sample_sums(feces) >= 6500, feces)
# Beta diversity PCoA plot
physeq_rar <- rarefy_even_depth(feces_6500, sample.size = 6500)
# Convert to RA (relative abundance)
physeq_rar_RA <- transform_sample_counts(physeq_rar, function(x) x/sum(x))
ord <- ordinate(physeq_rar_RA, method = "PCoA",
                distance = "unifrac",
                weighted = FALSE)
# Plot data
plot_ordination(physeq_rar_RA,
                ord,
                type = "samples",
                color = "season",
                title = "PCoA (Unweighted Unifrac)") +
  scale_colour_manual(values = c("Dry13" = "#98D391", "Wet" = "#A480DD",
                                 "Dry14" = "#589152"),
                      labels = c("2013 Dry", "2014 Dry", "2014 Wet")) +
  stat_ellipse(type = "norm", size = 1, linetype = 2) + 
  guides(colour = guide_legend("Seasons")) + 
  theme_bw(base_size = 14) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  xlab("PCo1 (16.9%)") +
  ylab("PCo2 (7.4%)")

#Beta diversity analysis for all fecal samples
#Move into Research Aim 3A folder
cd /data/Research_Aim_3A

#Create alpha and beta diversity core metrics
qiime diversity core-metrics-phylogenetic \
--i-phylogeny /data/rooted-tree.qza \
--i-table human-fecal-3A-table.qza \
--p-sampling-depth 6000 \
--m-metadata-file /data/tanzania_metadata.txt \
--output-dir human-fecal-3A-core-metrics-results

#Create Beta diversity boxplots with PERMANOVA values

qiime diversity beta-group-significance \
--i-distance-matrix human-fecal-3A-core-metrics-results/unweighted_unifrac_distance_matrix.qza \ 
--m-metadata-file /data/tanzania_metadata.txt \
--m-metadata-column sex \
--o-visualization human-fecal-3A-core-metrics-results/unweighted-unifrac-sex-significance.qzv \
--p-pairwise

qiime diversity beta-group-significance \
--i-distance-matrix human-fecal-3A-core-metrics-results/weighted_unifrac_distance_matrix.qza \ 
--m-metadata-file /data/tanzania_metadata.txt \
--m-metadata-column sex \
--o-visualization human-fecal-3A-core-metrics-results/weighted-unifrac-sex-significance.qzv \
--p-pairwise

qiime diversity beta-group-significance \
--i-distance-matrix human-fecal-3A-core-metrics-results/bray_curtis_distance_matrix.qza \ 
--m-metadata-file /data/tanzania_metadata.txt \
--m-metadata-column sex \
--o-visualization human-fecal-3A-core-metrics-results/bray_curtis_sex_significance.qzv \
--p-pairwise

qiime diversity beta-group-significance \
--i-distance-matrix human-fecal-3A-core-metrics-results/jaccard_distance_matrix.qza \ 
--m-metadata-file /data/tanzania_metadata.txt \
--m-metadata-column sex \
--o-visualization human-fecal-3A-core-metrics-results/jaccard_sex_significance.qzv \
--p-pairwise

#Recreating PCoA plot of weighted UniFrac for all fecal samples in R
#Recreating graphs in R
#Move into Research Aim 3A directory 

#Export the data
qiime tools export \
--input-path human-fecal-3A-table.qza \
--output-path exported 

qiime tools export \
--input-path /data/taxonomy.qza \
--output-path exported

qiime tools export \
--input-path /data/rooted-tree.qza \
--output-path exported

#Use a text editor to change column names
nano exported/taxonomy.tsv

#Combine BIOM data with taxonomy
biom add-metadata \
-i exported/feature-table.biom \
-o exported/table-with-taxonomy.biom \
--observation-metadata-fp exported/taxonomy.tsv \
--sc-separated taxonomy

#Move all files to local computer and place in the R directory you will be using
#Follow R Script adapted from Stephan Koenig

# Loading packages
library(tidyverse)
library(vegan)
library(phyloseq)
library(DESeq2)

BiocManager::install("latticeExtra")
# Importing all data files
biom_file <- import_biom("table-with-taxonomy.biom")
metadata <- import_qiime_sample_data("tanzania_metadata.txt")
tree <- read_tree_greengenes("tree.nwk")


library(ape)

tree <- multi2di(tree)
  
# Combine all objects into phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)
# Overview of phyloseq object
physeq

# Use sample_data() to look at metadata of phyloseq object
# Look only at first 6 lines of table with head()
head(sample_data(physeq))

# Set set of random numbers
set.seed(711)

# taxonomic rank names
rank_names(physeq)
# Rename column names for taxonomic ranks
colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class",
                                 "Order", "Family", "Genus",
                                 "Species")
rank_names(physeq)

# format data as table with as.data.frame()
as.data.frame(sample_names(physeq))
# Show read count for each sample using sample_sums()
as.data.frame(sample_sums(physeq))


# Beta diversity PCoA plot
# Diversity requires rarefied taxa tables
# We know from Qiime that we should rarefy to  so right palm
# samples level out
physeq_rar <- rarefy_even_depth(physeq, sample.size = 6000)
  

# Convert to RA (relative abundance)
physeq_rar_RA <- transform_sample_counts(physeq_rar, function(x) x/sum(x))

# We define the type of analysis with ordinate() and setting the
# method = PCoA, setting distance to type of beta diversity analysis
# you can change analysis to jaccard, bray-curtis and so on
ord <- ordinate(physeq_rar_RA, method = "PCoA",
                distance = "wunifrac")

# Plot data with labels and colour
plot_ordination(physeq_rar_RA,
                ord,
                type = "sample_type_merged",
                color = "sex",
                title = "PCoA (Weighted Unifrac)") +
  annotate(geom = "text",
           label = "Fecal Samples",
           x= -0.07,
           y= 0.06) +
  theme_bw()+
  theme(legend.text = element_text(size=10))+
  guides(shape= guide_legend(override.aes = list(size=4)))+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

#Beta diversity analysis for fecal samples from 2014 ED season
#change directory to Research_Aim_2
cd /data/Research_Aim_2/

#alpha and beta diversity analysis
qiime diversity core-metrics-phylogenetic \
  --i-phylogeny /data/rooted-tree.qza \
  --i-table 2014_ED_water_documented_filtered-table.qza \
  --p-sampling-depth 12000 \
  --m-metadata-file /data/tanzania_metadata.txt \
  --output-dir core-metrics-results

#Beta diversity group significance according to bush camp
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results/weighted_unifrac_distance_matrix.qza \
  --m-metadata-file /data/tanzania_metadata.txt \
  --m-metadata-column bush_camp \
  --o-visualization core-metrics-results/weighted-unifrac-bushcamp-group-significance.qzv \
  --p-pairwise

#Recreate weighted UniFrac plot for fecal samples from 2014 ED season in R
#Exporting biom data from qiime
#Exporting filtered table.qza
qiime tools export \  
--input-path 2014_ED_water_documented_filtered-table.qza \  
--output-path exported
#Exporting taxonomy.qza
qiime tools export \  
--input-path /data/taxonomy.qza \  
--output-path exported
#Exporting rooted-tree.qza
qiime tools export \
--input-path /data/rooted-tree.qza \
--output-path exported
#Open file in a text editor
#Combine taxonomy with BIOM data
biom add-metadata \
-i exported/feature-table.biom \
-o exported/table-with-taxonomy.biom \
--observation-metadata-fp exported/taxonomy.tsv \
--sc-separated taxonomy

#Import table-with-taxonomy.biom and tree.nwk to local computer
scp root@10.19.139.153:/data/Research_Aim_2/exported/table-with-taxonomy.biom .
scp root@10.19.139.153:/data/Research_Aim_2/exported/tree.nwk .
scp root@10.19.139.153:/data/tanzania_metadata.txt .

#R script for PCoA plot and taxa bar plot
#Trying to make a pretty PCoA plot
#Loading packages
library(tidyverse)
library(vegan)
library(phyloseq)
library(DESeq2)

# Helper functions
gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# Importing all data files
biom_file <- import_biom("table-with-taxonomy.biom")
metadata <- import_qiime_sample_data("tanzania_metadata.txt")
tree <- read_tree_greengenes("tree.nwk")
library(ape)
tree<-multi2di(tree)

# Combine all objects into phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)
#Overview of phyloseq object
physeq

# Set set of random numbers
set.seed(711)

# taxonomic rank names
rank_names(physeq)
# Rename column names for taxonomic ranks
colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class",
                                 "Order", "Family", "Genus",
                                 "Species")

rank_names(physeq)

# Use sample_data() to look at metadata of phyloseq object
# Look only at first 6 lines of table with head()
sample_data(physeq)

#Filter out all water samples
no_water<-subset_samples(physeq, sample_type_merged != "ENVIRONMENTAL_WATER")

sample_data(no_water)

# format data as table with as.data.frame()
as.data.frame(sample_names(no_water))
# Show read count for each sample using sample_sums()
as.data.frame(sample_sums(no_water))

# Beta diversity PCoA plot
# Diversity requires rarefied taxa tables
# We know from Qiime that we should rarefy to 12000
physeq_rar <- rarefy_even_depth(no_water, sample.size = 12000)

# Convert to RA (relative abundance)
physeq_rar_RA <- transform_sample_counts(physeq_rar, function(x) x/sum(x))

# We define the type of analysis with ordinate() and setting the
# method = PCoA, setting distance to type of beta diversity analysis
# you can change analysis to jaccard, bray-curtis and so on
ord <- ordinate(physeq_rar_RA, method = "PCoA",
                distance = "wunifrac")

# Plot data
plot_ordination(physeq_rar_RA,
                ord,
                type = "sample",
                color = "bush_camp",
                title = "PCoA (Weighted Unifrac)") +
  guides(colour = guide_legend("Bush Camp")) +
  # Adding text to plot
  annotate(geom = "text",
           label = "Fecal Samples 2014 ED",
           x = -0.2,
           y = -0.55,
           size = 4) + theme_bw()

#Beta diversity analysis of all skin swab samples
#Move into the Research_Aim_3B directory
cd Research_Aim_3B

#Perform diversity analysis on the right hand microbiome data
#A sampling depth of 8070 will be used, as determined by performing alpha rarefaction
qiime diversity core-metrics-phylogenetic \
  --i-phylogeny /data/rooted-tree.qza \
  --i-table right-hand-filtered-table.qza \
  --p-sampling-depth 8070 \
  --m-metadata-file /data/tanzania_metadata.txt \
  --output-dir right-hand-core-metrics-results

#Generate beta diversity box and whisker plots, along with PERMANOVA to compare male and female right hand samples.
qiime diversity beta-group-significance \
  --i-distance-matrix right-hand-core-metrics-results/unweighted_unifrac_distance_matrix.qza \
  --m-metadata-file /data/tanzania_metadata.txt \
  --m-metadata-column sex \
  --o-visualization right-hand-core-metrics-results/unweighted-unifrac-sex-significance.qzv \
  --p-pairwise

qiime diversity beta-group-significance \
  --i-distance-matrix right-hand-core-metrics-results/weighted_unifrac_distance_matrix.qza \
  --m-metadata-file /data/tanzania_metadata.txt \
  --m-metadata-column sex \
  --o-visualization right-hand-core-metrics-results/weighted-unifrac-sex-significance.qzv \
  --p-pairwise

qiime diversity beta-group-significance \
  --i-distance-matrix right-hand-core-metrics-results/bray_curtis_distance_matrix.qza \
  --m-metadata-file /data/tanzania_metadata.txt \
  --m-metadata-column sex \
  --o-visualization right-hand-core-metrics-results/bray-curtis-sex-significance.qzv \
  --p-pairwise

qiime diversity beta-group-significance \
  --i-distance-matrix right-hand-core-metrics-results/jaccard_distance_matrix.qza \
  --m-metadata-file /data/tanzania_metadata.txt \
  --m-metadata-column sex \
  --o-visualization right-hand-core-metrics-results/jaccard-sex-significance.qzv \
  --p-pairwise

#Remake desired plot (Weighted Unifrac) in R for all skin swab samples

#Export the data
qiime tools export \
--input-path right-hand-filtered-table.qza \
--output-path exported

qiime tools export \
--input-path /data/taxonomy.qza \
--output-path exported

qiime tools export \
--input-path /data/rooted-tree.qza \
--output-path exported

#Use a text editor to change column names
nano exported/taxonomy.tsv 

#Change "Feature ID" to "#OTUID"
#Change "Taxon" to "taxonomy"
#Change "Condifence" to "confidence"

#Combine the BIOM data with taxonomy
biom add-metadata \
-i exported/feature-table.biom \
-o exported/table-with-taxonomy.biom \
--observation-metadata-fp exported/taxonomy.tsv \
--sc-separated taxonomy

#Move all exported files to the local computer and open R

# Loading packages
library(tidyverse)
library(vegan)
library(phyloseq)
library(DESeq2)

# Helper functions
gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# Import all data files
biom_file <- import_biom("table-with-taxonomy.biom")
metadata <- import_qiime_sample_data("tanzania_metadata.txt")
tree <- read_tree_greengenes("tree.nwk")

library(ape)

tree <- multi2di(tree)

# Combine all objects into a phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)
# Overview of phyloseq object
physeq

# Use sample_data() to look at metadata of phyloseq object
# Look only at first 6 lines of table with head()
head(sample_data(physeq))

# Create set of random numbers
set.seed(711)

# taxonomic rank names
rank_names(physeq)
# Rename column names for taxonomic ranks
colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class",
                                 "Order", "Family", "Genus",
                                 "Species")
rank_names(physeq)

# format data as table with as.data.frame()
as.data.frame(sample_names(physeq))
# Show read count for each sample using sample_sums()
as.data.frame(sample_sums(physeq))

# Create a beta diversity PCoA plot
# Diversity requires rarefied taxa tables
# We know from Qiime2 that we should rarefy to 8070
physeq_rar <- rarefy_even_depth(physeq, sample.size = 8070)
  

# Convert to RA (relative abundance)
physeq_rar_RA <- transform_sample_counts(physeq_rar, function(x) x/sum(x))

# Define the type of analysis with ordinate() and set the method = PCoA, with distance equating to the type of beta diversity analysis
ord <- ordinate(physeq_rar_RA, method = "PCoA",
                distance = "wunifrac")

# Plot data
plot_ordination(physeq_rar_RA,
                ord,
                type = "sample_type_merged",
                color = "sex",
                title = "PCoA (Weighted Unifrac)") +
  # Add text to the plot and clean up the figure
  annotate(geom = "text",
           label = "Right Hand Samples",
           x = - 0.08,
           y = 0.07,
           size = 5)+
  geom_point(aes(shape=bush_camp, size=2), show.legend=F)+
  scale_shape_manual(name="bush camp", values=c(7,9,15,17,18,19))+
  theme_bw()+
  theme(legend.text = element_text(size=10))+
  guides(shape= guide_legend(override.aes = list(size=4)))+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()

#Training a taxonomic classifier
#Train taxonomic classifier to perform taxonomic analysis
#Download and unzip the Greengenes 99% database
wget "ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz"

gunzip gg_13_8_otus.tar.gz

tar -xvf gg_13_8_otus.tar

qiime tools import \
--type 'FeatureData[Sequence]' \
--input-path gg_13_8_otus/rep_set/99_otus.fasta \
--output-path ref-otus.qza

qiime tools import \
--type 'FeatureData[Taxonomy]' \
--input-format HeaderlessTSVTaxonomyFormat \
--input-path gg_13_8_otus/taxonomy/99_otu_taxonomy.txt \
--output-path ref-taxonomy.qza

#Convert database containg full-length 16S reference sequences to one that contains same length sequences from 16S regions sequenced in the Smits et al. paper
#Smits et al. used 515F and 806R primers

qiime feature-classifier extract-reads \
--i-sequences ref-otus.qza \
--p-f-primer GTGCCAGCMGCCGCGGTAA \
--p-r-primer GGACTACNVGGGTWTCTAAT \
--p-trunc-len 150 \
--o-reads ref-seqs.qza

#Train the classifier using reference reads
qiime feature-classifier fit-classifier-naive-bayes \
--i-reference-reads ref-seqs.qza \
--i-reference-taxonomy ref-taxonomy.qza \
--o-classifier classifier.qza

#Use the newly trained classifier to conduct taxonomic analysis
qiime feature-classifier classify-sklearn \
--i-classifier classifier.qza \
--i-reads denoise_150/rep-seqs-dada2_150.qza \
--o-classification taxonomy.qza

qiime metadata tabulate \
--m-input-file taxonomy.qza \
--o-visualization taxonomy.qzv

#Make taxa bar plot for water samples from 2014 ED season in R
#Loading packages
library(tidyverse)
library(vegan)
library(phyloseq)
library(DESeq2)

# Helper functions
gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# Importing all data files
biom_file <- import_biom("table-with-taxonomy.biom")
metadata <- import_qiime_sample_data("tanzania_metadata.txt")
tree <- read_tree_greengenes("tree.nwk")
library(ape)
tree<-multi2di(tree)

# Combine all objects into phyloseq object
physeq <- merge_phyloseq(biom_file, metadata, tree)
#Overview of phyloseq object
physeq

# Set set of random numbers
set.seed(711)

# taxonomic rank names
rank_names(physeq)
# Rename column names for taxonomic ranks
colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class",
                                 "Order", "Family", "Genus",
                                 "Species")

rank_names(physeq)

# Use sample_data() to look at metadata of phyloseq object
# Look only at first 6 lines of table with head()
sample_data(physeq)
#Trying to make a taxa bar plot
physeq
sample_data(physeq)
#transform into relative abundances
physeq2<-transform_sample_counts(physeq, function(x) x / sum(x) )
physeq2
#choose only fecal samples
water<-subset_samples(physeq2, sample_type_merged == "ENVIRONMENTAL_WATER")
water
#create dataframe from phyloseq object
wat<-psmelt(water)
#turn all OTUs into family counts
glom <- tax_glom(water, taxrank = "Family")
glom # should list # taxa as # phyla
data <- psmelt(glom) # create dataframe from phyloseq object
data$Family <- as.character(data$Family) #convert to character

#simple way to rename phyla with < 2.5% abundance
data$Family[data$Abundance < 0.025] <- "< 2.5% abund."

#Adding factor levels to env_feature and changing the values
levels(data$env_feature) <- c(levels(data$env_feature), "Dry Riverbed", "Stream", "Well")
data$env_feature[data$env_feature == "dry river"] <- "Dry Riverbed"
data$env_feature[data$env_feature == "stream"] <- "Stream"
data$env_feature[data$env_feature == "water well"] <- "Well"


#plot with condensed phyla into "< 2.5% abund" category
p <- ggplot(data=data, aes(x=env_feature, y=Abundance, fill=Family))+
  geom_bar(aes(), stat="identity", position="fill") + xlab("Water Source") +
  ylab("Relative Abundance") + scale_fill_manual(values = c("darkblue", "darkgoldenrod1",
    "darkseagreen", "darkorchid", "darkolivegreen1", "lightskyblue", "darkgreen",
    "deeppink", "khaki2", "firebrick", "brown1", "darkorange1", "cyan1")) + theme(legend.position = 
                                                                                    "right")

#Differential abundance analysis for all skin swab samples between males and females
#Filter out low frequency ASVs prior to differential abundance testing.
#Total number of reads= 2132568.
#2132568 x 0.00005 = 106.6, therefore we will set the p-min frequency to 107.

qiime feature-table filter-features \
--i-table right-hand-filtered-table.qza \
--p-min-frequency 107 \
--o-filtered-table right-hand-filtered-table-2.qza

#Filter out non-bacterial sequences prior to differential abundance testing.
qiime taxa filter-table \
--i-table right-hand-filtered-table-2.qza \
--i-taxonomy /data/taxonomy.qza \
--p-exclude mitochondia,chloroplast,archaea \
--o-filtered-table right-hand-filtered-table-3.qza

#Perform differential abundance testing using ANCOM to find features that are differentially abundant between males and females.
qiime composition add-pseudocount \
--i-table right-hand-filtered-table-3.qza \
--o-composition-table comp-right-hand-filtered-table.qza

qiime composition ancom \
--i-table comp-right-hand-filtered-table.qza \
--m-metadata-file /data/tanzania_metadata.txt \
--m-metadata-column sex \
--o-visualization ancom-right-hand-sex.qzv

#Perform the analysis at a specific taxonomic level (at the genus level here). 

qiime taxa collapse \
--i-table right-hand-filtered-table-3.qza \
--i-taxonomy /data/taxonomy.qza \
--p-level 6 \
--o-collapsed-table right-hand-table-lvl6.qza

qiime composition add-pseudocount \
--i-table right-hand-table-lvl6.qza \
--o-composition-table comp-right-hand-table-lvl6.qza

qiime composition ancom \
--i-table comp-right-hand-table-lvl6.qza \
--m-metadata-file /data/tanzania_metadata.txt \
--m-metadata-column sex \
--o-visualization ancom-right-hand-sex-lvl6.qzv

#Differential abundance analysis of fecal samples from 2014 ED season according to bush camp
#Differential abundance analysis for human fecal samples according to bush camps
#Need more than one fecal sample per bush camp
#need to filter out environmental samples first
qiime feature-table filter-samples \
  --i-table 2014_ED_water_documented_filtered-table.qza \
  --m-metadata-file /data/tanzania_metadata.txt \
  --p-where "[sample_type_merged]='HUMAN_FECES'" \
  --o-filtered-table 2014_ED_feces_documented_water_table.qza

#Filter out Hukamako sample since there's only one
qiime feature-table filter-samples \
  --i-table 2014_ED_feces_documented_water_table.qza \
  --m-metadata-file /data/tanzania_metadata.txt \
  --p-where "[bush_camp] IN ('BENJA', 'ENDAMANGA_SCHOOL', 'EZEKILI', 'GIDAMILANDA', 'MSAFIRI', 'MWAMUDU', 'SAITOTI', 'SENGELI')" \
  --o-filtered-table bush_camp_filtered_no_hukamako_table.qza

#filter out low frequency ASVs
qiime feature-table filter-features \
  --i-table bush_camp_filtered_no_hukamako_table.qza \
  --p-min-frequency 2 \
  --o-filtered-table bushcamp_nohukamako_frequency-filtered-table.qza
#filter out non-bacterial sequences
qiime taxa filter-table \
  --i-table bushcamp_nohukamako_frequency-filtered-table.qza \
  --i-taxonomy /data/taxonomy.qza \
  --p-exclude mitochondria,chloroplast,archea \
  --o-filtered-table bushcamp_nohukamako_frequency-filtered-table.qza

#add pseudocounts
qiime composition add-pseudocount \
  --i-table bushcamp_nohukamako_frequency-filtered-table.qza \
  --o-composition-table comp_bush_camp_table.qza

#run ANCOM on the bush_camp column
qiime composition ancom \
  --i-table comp_bush_camp_table.qza \
  --m-metadata-file /data/tanzania_metadata.txt \
  --m-metadata-column bush_camp \
  --o-visualization ancom_bush_camp.qzv

#collapse features to the family level
qiime taxa collapse \
  --i-table bushcamp_nohukamako_frequency-filtered-table.qza \
  --i-taxonomy /data/taxonomy.qza \
  --p-level 5 \
  --o-collapsed-table bush_camp_table_l5.qza

qiime composition add-pseudocount \
  --i-table bush_camp_table_l5.qza \
  --o-composition-table comp_bush_camp_table_l5.qza

qiime composition ancom \
  --i-table comp_bush_camp_table_l5.qza \
  --m-metadata-file /data/tanzania_metadata.txt \
  --m-metadata-column bush_camp \
  --o-visualization l5_ancom_bush_camp.qzv

#Differential abundance analysis of all fecal samples by sex
#Filter out low frequency ASV
#Total number of reads = 12,115,030 -> 12115030 x 0.00005 =605.75 -> will set p-min at 6060

qiime feature-table filter-features \
--i-table human-fecal-3A-table.qza \
--p-min-frequency 606 \
--o-filtered-table human-fecal-3A-table-2.qza

#Filter out non-bacterial sequences prior to differential abundance tests
qiime taxa filter-table \
--i-table human-fecal-3A-table-2.qza \
--i-taxonomy /data/taxonomy.qza \ 
--p-exclude mitochondria,chloroplast,archaea \
--o-filtered-table human-fecal-3A-table-3.qza

#Perform differential abundance testing using ANCOM
qiime composition add-pseudocount \
--i-table human-fecal-3A-table-3.qza \
--o-composition-table comp-fecal-3A-table.qza

qiime composition ancom \
--i-table comp-fecal-3A-table.qza \
--m-metadata-file /data/tanzania_metadata.txt \
--m-metadata-column sex \
--o-visualization ancom-fecal-sex.qzv

#Repeat ANCOM but only at the genus level (level 6)
qiime taxa collapse \
--i-table human-fecal-3A-table-3.qza \
--i-taxonomy /data/taxonomy.qza \
--p-level 6 \
--o-collapsed-table human-fecal-3A-table-lvl6.qza

qiime composition add-pseudocount \
--i-table human-fecal-3A-table-lvl6.qza \
--o-composition-table comp-fecal-3A-table-lvl6.qza

qiime composition ancom \
--i-table comp-fecal-3A-table-lvl6.qza \
--m-metadata-file /data/tanzania_metadata.txt \
--m-metadata-column sex \
--o-visualization ancom-fecal-sex-lvl6.qzv

#Longitudinal analysis
# Longitudinal analysis
# add taxonomy data to .biom file
biom add-metadata -i feature-table.biom -o table-with-taxonomy-filtered.biom -m /data/Research_Aim_1/tanzania_metadata_season_camp_noinfant.tsv --observation-metadata-fp taxonomy.tsv --sc-separated taxonomy
## dry season 
qiime longitudinal pairwise-distances \
  --i-distance-matrix /data/Research_Aim_1/core-metrics-results_filtered/unweighted_unifrac_distance_matrix.qza \
  --m-metadata-file /data/Research_Aim_1/tanzania_metadata_season_camp_noinfant.tsv \
  --p-group-column host_life_stage \
  --p-state-column season \
  --p-state-1 Dry13 \
  --p-state-2 Dry14 \
  --p-individual-id-column unique_name_type_season \
  --p-replicate-handling random \
  --o-visualization dry-season-pairwise-differences_filtered.qzv

## wet to dry 2013, following the figure from the original paper. The wet seasons has been combined
qiime longitudinal pairwise-distances \
  --i-distance-matrix /data/Research_Aim_1/core-metrics-results_filtered/unweighted_unifrac_distance_matrix.qza \
  --m-metadata-file /data/Research_Aim_1/tanzania_metadata_season_camp_noinfant.tsv \
  --p-group-column host_life_stage \
  --p-state-column season \
  --p-state-1 Dry13 \
  --p-state-2 Wet14 \
  --p-individual-id-column unique_name_type_season \
  --p-replicate-handling random \
  --o-visualization wet13-season-pairwise-differences_filtered.qzv

## wet 2014 to dry 2014, following the figure from the original paper
qiime longitudinal pairwise-distances \
  --i-distance-matrix /data/Research_Aim_1/core-metrics-results_filtered/unweighted_unifrac_distance_matrix.qza \
  --m-metadata-file /data/Research_Aim_1/tanzania_metadata_season_camp_noinfant.tsv \
  --p-group-column host_life_stage \
  --p-state-column season \
  --p-state-1 Dry14 \
  --p-state-2 Wet14 \
  --p-individual-id-column unique_name_type_season \
  --p-replicate-handling random \
  --o-visualization wet14-season-pairwise-differences_filtered.qzv