<Script1.sh>
# Create and navigate to directory for project
mkdir /data/infant
cd /data/infant

# Import demultiplexed data into project directory
qiime tools import \
--type 'SampleData[SequencesWithQuality]' \
--input-path /mnt/datasets/project_2/infant/infant_manifest.txt \
--output-path demux.qza \
--input-format SingleEndFastqManifestPhred33V2

# Visualization of demultiplexed samples
qiime demux summarize \
--i-data demux.qza \
--o-visualization demux.qzv

# Sequence quality control and determine ASV with DADA2
qiime dada2 denoise-single \
--i-demultiplexed-seqs demux.qza \
--p-trim-left 0 \
--p-trunc-len 150 \
--o-representative-sequences rep-seqs.qza \
--o-table table-dada2.qza \
--o-denoising-stats stats-dada2.qza

# Create visualization of DADA2 stats
qiime metadata tabulate \
--m-input-file stats-dada2.qza \
--o-visualization stats-dada2.qzv

# Generate feature table
qiime feature-table summarize \
--i-table table.qza \
--o-visualization table.qzv

# Map feature IDs to sequences
qiime feature-table tabulate-seqs \
--i-data rep-seqs.qza \
--o-visualization rep-seqs.qzv

# Generate tree for phylogenetic diversity analyses
qiime phylogeny align-to-tree-mafft-fasttree \
--i-sequences rep-seqs.qza \
--o-alignment aligned-rep-seqs.qza \
--o-masked-alignment masked-aligned-rep-seqs.qza \
--o-tree unrooted-tree.qza \
--o-rooted-tree rooted-tree.qza

# Alpha-rarefaction to correct sampling depth
qiime diversity alpha-rarefaction \
 --i-table table.qza \
 --i-phylogeny rooted-tree.qza \
 --p-max-depth 37000 \
 --m-metadata-file infant_metadata.txt \
 --o-visualization alpha-rarefaction.qzv

# Calculate alpha- and beta-diversity metrics with 4250 sampling depth
qiime diversity core-metrics-phylogenetic \
 --i-phylogeny rooted-tree.qza \
 --i-table table.qza \
 --p-sampling-depth 4250 \
 --m-metadata-file infant_metadata.txt \
 --output-dir core_metrics_results_4250_depth

# Download pre-trained Naive Bayes classifier trained
# on Greengenes 13_8 99% OTUs
wget "https://data.qiime2.org/2020.8/common/gg-13-8-99-515-806-nb-classifier.qza"

# Generate taxonomy.qza with QIIME
qiime feature-classifier classify-sklearn 
 --i-classifier gg-13-8-99-515-806-nb-classifier.qza \
 --i-reads rep-seqs.qza \
 --o-classification taxonomy.qza

# Export BIOM data from QIIME
qiime tools export \
 --input-path table.qza \
 --output-path exported

qiime tools export \
 --input-path taxonomy.qza \
 --output-path exported

qiime tools export \
 --input-path rooted-tree.qza \
 --output-path exported

# Edit the column names and change
# `Feature ID` to `#OTUID`
# `Taxon` to `taxonomy`
# `Confidence` to `confidence`
nano exported/taxonomy.tsv

# Combine taxonomy with BIOM data
biom add-metadata \
 -i exported/feature-table.biom \
 -o exported/table-with-taxonomy.biom \
 --observation-metadata-fp exported/taxonomy.tsv \
 --sc-separated taxonomy
 
 
 
 
 
 
<Script2.r>
##################################################################
#MICB447 Project 2
#Author: Enoch Yau, Team 2
#This script used to answer questions regarding mode of delivery (behavioral and microbial differences)
#####################################################################
#loading required packages
library(ggplot2)
library(tidyverse)

############################################
#data setup in this section

#initial importing of data
infant_metadata <- readxl::read_xlsx("infant_metadata.xlsx")

#filter to only get metadata variables of interest from infants whose birth method is known
#don't keep "enjoyment_food", "general_appetite", or "satiety_responsive", since these don't appear to be collected at the same timepoint
infant_metadata_filtered <- subset(infant_metadata, life_stage=="Infant" & baby_delivery!="not collected")
infant_metadata_filtered <- select(infant_metadata_filtered, 
                                   c("#SampleID", "life_stage", "age_category",
                                     "anonymized_name", "baby_delivery","enjoyment_food_2_months",
                                     "enjoyment_food_2_weeks","enjoyment_food_4_months","enjoyment_food_6_months",
                                     "feed","general_appetite_2_months","general_appetite_2_weeks",
                                     "general_appetite_4_months","general_appetite_6_months",
                                     "satiety_responsive_2_months","satiety_responsive_2_weeks",
                                     "satiety_responsive_4_months","satiety_responsive_6_months",
                                     "wlz_2w", "wlz_2m", "wlz_4m", "wlz_6m","wlz_9m", "wlz_12m"))

#convert feeding behavior metadata columns to numerical form, replacing "not collected" with NA (default setting)
numerical_data <- c("enjoyment_food_2_months",
                    "enjoyment_food_2_weeks", "enjoyment_food_4_months",
                    "enjoyment_food_6_months","general_appetite_2_months",
                    "general_appetite_2_weeks","general_appetite_4_months",
                    "general_appetite_6_months", "satiety_responsive_2_months",
                    "satiety_responsive_2_weeks","satiety_responsive_4_months",
                    "satiety_responsive_6_months","wlz_2w", "wlz_2m", "wlz_4m", "wlz_6m","wlz_9m", "wlz_12m")
infant_metadata_filtered[numerical_data]<- lapply(infant_metadata_filtered[numerical_data], as.numeric)

#remove duplicated entries (entries with the same anonymized name)
infant_metadata_filtered_no_duplicates <- infant_metadata_filtered[!duplicated(infant_metadata_filtered$anonymized_name),]

#rearrange data so that a time variable is introduced into the frame and all the feeding behavior values are consolidated into 1 column
#should make it easier to plot line graphs this way
#create a data frame for each timepoint first
FeedBhvr_2weeks <- data.frame("anonymized_name"=infant_metadata_filtered_no_duplicates$anonymized_name,
                              "baby_delivery"=infant_metadata_filtered_no_duplicates$baby_delivery,
                              "Time"="2 weeks",
                              "general_appetite"=infant_metadata_filtered_no_duplicates$general_appetite_2_weeks,
                              "enjoyment_food"=infant_metadata_filtered_no_duplicates$enjoyment_food_2_weeks,
                              "satiety_responsive"=infant_metadata_filtered_no_duplicates$satiety_responsive_2_weeks,
                              "wlz"=infant_metadata_filtered_no_duplicates$wlz_2w)
FeedBhvr_2months <- data.frame("anonymized_name"=infant_metadata_filtered_no_duplicates$anonymized_name,
                               "baby_delivery"=infant_metadata_filtered_no_duplicates$baby_delivery,
                               "Time"="2 months",
                               "general_appetite"=infant_metadata_filtered_no_duplicates$general_appetite_2_months,
                               "enjoyment_food"=infant_metadata_filtered_no_duplicates$enjoyment_food_2_months,
                               "satiety_responsive"=infant_metadata_filtered_no_duplicates$satiety_responsive_2_months,
                               "wlz"=infant_metadata_filtered_no_duplicates$wlz_2m)
FeedBhvr_4months <- data.frame("anonymized_name"=infant_metadata_filtered_no_duplicates$anonymized_name,
                               "baby_delivery"=infant_metadata_filtered_no_duplicates$baby_delivery,
                               "Time"="4 months",
                               "general_appetite"=infant_metadata_filtered_no_duplicates$general_appetite_4_months,
                               "enjoyment_food"=infant_metadata_filtered_no_duplicates$enjoyment_food_4_months,
                               "satiety_responsive"=infant_metadata_filtered_no_duplicates$satiety_responsive_4_months,
                               "wlz"=infant_metadata_filtered_no_duplicates$wlz_4m)
FeedBhvr_6months <- data.frame("anonymized_name"=infant_metadata_filtered_no_duplicates$anonymized_name,
                               "baby_delivery"=infant_metadata_filtered_no_duplicates$baby_delivery,
                               "Time"="6 months",
                               "general_appetite"=infant_metadata_filtered_no_duplicates$general_appetite_6_months,
                               "enjoyment_food"=infant_metadata_filtered_no_duplicates$enjoyment_food_6_months,
                               "satiety_responsive"=infant_metadata_filtered_no_duplicates$satiety_responsive_6_months,
                               "wlz"=infant_metadata_filtered_no_duplicates$wlz_6m)
FeedBhvr_9months <- data.frame("anonymized_name"=infant_metadata_filtered_no_duplicates$anonymized_name,
                               "baby_delivery"=infant_metadata_filtered_no_duplicates$baby_delivery,
                               "Time"="9 months", "general_appetite"=NA, "enjoyment_food"=NA, "satiety_responsive"=NA,
                               "wlz"=infant_metadata_filtered_no_duplicates$wlz_9m)
FeedBhvr_12months <- data.frame("anonymized_name"=infant_metadata_filtered_no_duplicates$anonymized_name,
                                "baby_delivery"=infant_metadata_filtered_no_duplicates$baby_delivery,
                                "Time"="12 months", "general_appetite"=NA, "enjoyment_food"=NA, "satiety_responsive"=NA,
                                "wlz"=infant_metadata_filtered_no_duplicates$wlz_12m)
#combine dataframes above into one
FeedBhvr_Trend <- rbind(FeedBhvr_2weeks, FeedBhvr_2months, FeedBhvr_4months, FeedBhvr_6months, FeedBhvr_9months, FeedBhvr_12months)

#set the data frame so that time is properly ordered (instead of the default alphanumeric order)
FeedBhvr_Trend$Time <- factor(FeedBhvr_Trend$Time, c("2 weeks", "2 months", "4 months", "6 months", "9 months", "12 months"))


################################################################################
#Setup for and plotting data in this section

#creating functions to calculate t test p values in a more concise way than a code line for each t test
#functions will also make exporting p values into a dataframe for importing into figures easier
#function description: determine p value of t-test to 3 digits for respective metadata category between baby_delivery methods at time x
GA_pvalue_calc <- function(x){round(t.test(general_appetite~baby_delivery, FeedBhvr_Trend[FeedBhvr_Trend$Time==x,])$p.value, digits=3)}
EF_pvalue_calc <- function(x){round(t.test(enjoyment_food~baby_delivery, FeedBhvr_Trend[FeedBhvr_Trend$Time==x,])$p.value, digits=3)}
SR_pvalue_calc <- function(x){round(t.test(satiety_responsive~baby_delivery, FeedBhvr_Trend[FeedBhvr_Trend$Time==x,])$p.value, digits=3)}

#creating vector of timepoints so i can run the function at each time using sapply
timepoints=c("2 weeks", "2 months", "4 months", "6 months")

#making dataframe containing all p values from t-tests so i can add values to timewise plot
pvalue_results <- data.frame("Time"=c(timepoints), EFpvalue=sapply(timepoints, EF_pvalue_calc),
                             GApvalue=sapply(timepoints, GA_pvalue_calc), SRpvalue=sapply(timepoints, SR_pvalue_calc))

#BH correction for p values
pvalue_adj <- data.frame("Time"=c(timepoints), EFpvalue=round(p.adjust(pvalue_results$EFpvalue, method="BH"), digits=3), 
                         GApvalue=round(p.adjust(pvalue_results$GApvalue, method="BH"), digits=3),
                         SRpvalue=round(p.adjust(pvalue_results$SRpvalue, method="BH"), digits=3))

#General Appetite line graph
ggplot(FeedBhvr_Trend,aes(x=Time)) +
  stat_summary(aes(y=general_appetite, colour=baby_delivery, group=baby_delivery), fun=mean, geom="line") +
  stat_summary(aes(y=general_appetite, shape=baby_delivery, colour=baby_delivery), fun=mean, geom="point", size=3) +
  scale_shape_manual(values=c(15,17)) +
  stat_summary(aes(y=general_appetite, colour=baby_delivery), fun.data=mean_se, geom="errorbar", width=0.1) +
  scale_colour_manual(values=c("blue", "red")) +
  theme_classic() +
  labs(x="Infant Age",y="General Appetite", shape="Mode of Delivery", colour="Mode of Delivery") +
  theme(plot.title = element_text(hjust=0.5)) +
  coord_cartesian(ylim=c(3.5,4.7)) +
  scale_x_discrete(limits=c("2 weeks", "2 months", "4 months", "6 months")) +
  geom_text(data=pvalue_adj, aes(x=Time,y=3.55, label=paste("p=",GApvalue))) +
  theme(legend.position=c(0.16, 0.87), legend.box.background = element_rect(size=1.2)) +
  theme(text=element_text(size=17))

#Food Enjoyment line graph
ggplot(FeedBhvr_Trend,aes(x=Time)) +
  stat_summary(aes(y=enjoyment_food, colour=baby_delivery, group=baby_delivery), fun=mean, geom="line") +
  stat_summary(aes(y=enjoyment_food, shape=baby_delivery, colour=baby_delivery), fun=mean, geom="point", size=3) +
  scale_shape_manual(values=c(15,17)) +
  stat_summary(aes(y=enjoyment_food, colour=baby_delivery), fun.data=mean_se, geom="errorbar", width=0.1) +
  scale_colour_manual(values=c("blue", "red")) +
  theme_classic() +
  labs(x="Infant Age",y="Food Enjoyment", shape="Mode of Delivery", colour="Mode of Delivery") +
  theme(plot.title = element_text(hjust=0.5)) +
  coord_cartesian(ylim=c(4.2,4.9)) +
  scale_x_discrete(limits=c("2 weeks", "2 months", "4 months", "6 months")) +
  geom_text(data=pvalue_adj, aes(x=Time,y=4.2, label=paste("p=",EFpvalue))) +
  theme(legend.position=c(0.85, 0.85), legend.box.background = element_rect(size=1.2)) +
  theme(text=element_text(size=17))

#Satiety line graph
ggplot(FeedBhvr_Trend,aes(x=Time)) +
  stat_summary(aes(y=satiety_responsive, colour=baby_delivery, group=baby_delivery), fun=mean, geom="line") +
  stat_summary(aes(y=satiety_responsive, shape=baby_delivery, colour=baby_delivery), fun=mean, geom="point", size=3) +
  scale_shape_manual(values=c(15,17)) +
  stat_summary(aes(y=satiety_responsive, colour=baby_delivery), fun.data=mean_se, geom="errorbar", width=0.1) +
  scale_colour_manual(values=c("blue", "red")) +
  theme_classic() +
  labs(x="Infant Age",y="Satiety-Responsiveness", shape="Mode of Delivery", colour="Mode of Delivery") +
  theme(plot.title = element_text(hjust=0.5)) +
  coord_cartesian(ylim=c(1.6,2.5)) +
  scale_x_discrete(limits=c("2 weeks", "2 months", "4 months", "6 months")) +
  geom_text(data=pvalue_adj, aes(x=Time,y=1.61, label=paste("p=",SRpvalue))) +
  theme(legend.position=c(0.85, 0.85), legend.box.background = element_rect(size=1.2)) +
  theme(text=element_text(size=17))

##################################################################################
#this section for wlz analysis between modes of delivery

#testing significance of wlz first using t tests
#function to determine p value of t-test between modes of delivery at time y, as above
wlz_pvalue_calc <- function(x){round(t.test(wlz~baby_delivery, FeedBhvr_Trend[FeedBhvr_Trend$Time==x,])$p.value, digits=3)}

#making dataframe so i can add p values to timewise plot, as above
timepoints_long <- c(timepoints,"9 months", "12 months")
wlz_pvalue_results <- data.frame("Time"=timepoints_long, pvalue=sapply(timepoints_long, wlz_pvalue_calc))
wlz_padj <- data.frame("Time"=timepoints_long, pvalue=round(p.adjust(wlz_pvalue_results$pvalue, method="BH"), digits=3))

#plotting wlz over time between modes of delivery
ggplot(FeedBhvr_Trend,aes(x=Time)) +
  stat_summary(aes(y=wlz, colour=baby_delivery, group=baby_delivery), fun=mean, geom="line") +
  stat_summary(aes(y=wlz, shape=baby_delivery, colour=baby_delivery), fun=mean, geom="point", size=3) +
  scale_shape_manual(values=c(15,17)) +
  stat_summary(aes(y=wlz, colour=baby_delivery), fun.data=mean_se, geom="errorbar", width=0.1) +
  scale_colour_manual(values=c("blue", "red")) +
  theme_classic() +
  labs(x="Infant Age",y="Weight/Length Z-score", shape="Mode of Delivery", colour="Mode of Delivery") +
  theme(plot.title = element_text(hjust=0.5)) +
  coord_cartesian(ylim=c(-1,1.2)) +
  geom_text(data=wlz_padj, aes(x=Time,y=-0.9, label=paste("p=",pvalue))) +
  theme(legend.position=c(0.17, 0.8), legend.box.background = element_rect(size=1.2)) +
  theme(text=element_text(size=17))


########################################################################################
#calculating n of data points and making a dataframe out of it for reference later on

#making a function to count the samples from babies born via delivery method y, at time x and looking at feeding behavior z
n_calc <- function(x,y,z){
  number <- nrow(filter(FeedBhvr_Trend,baby_delivery==y & Time==x & z!="NA"))
  print(number)
}

#function to run n_calc at all timepoints and both modes of delivery, for feeding behavior a
count_list <- function(a){c(sapply(timepoints_long,n_calc, y="vaginal", z=a), sapply(timepoints_long,n_calc, y="c-section", z=a))}

#making the data frame of the n's
FeedBhvr_Summary <- data.frame("baby_delivery"=c(rep("vaginal",6),rep("c-section",6)), "Time"=c(rep(timepoints_long,2)),
                               "general_appetite_count"=count_list(FeedBhvr_Trend$general_appetite),
                               "enjoyment_food_count"=count_list(FeedBhvr_Trend$enjoyment_food),
                               "satiety_responsiveness_count"=count_list(FeedBhvr_Trend$satiety_responsive),
                               "wlz_count"=count_list(FeedBhvr_Trend$wlz))

#################################################################################################
#This section for beta diversity analysis
#packages needed for this section
library(vegan)
library(phyloseq)
library(DESeq2)

#import files used to create physeq object
biom_file <- import_biom("table-with-taxonomy.biom")
metadata  <- import_qiime_sample_data("infant_metadata.txt")
tree      <- read_tree_greengenes("tree.nwk")

#create physeq object
physeq    <-  merge_phyloseq(biom_file, metadata, tree)

#converting taxonomic rank to names
colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class", "Order",
                                 "Family", "Genus", "Species")

#only keeping samples from infants, and samples where baby_delivery is known
infant <- subset_samples(physeq, life_stage=="Infant" & baby_delivery!="not collected")

#set seed for reproducibility
set.seed(1124)

#setting functions and values for later
calculate_RA <- function(x) x/sum(x)

#########################################################
#making PCoA plots
#rarefy OTU table so all samples have same size
physeq_rar <- rarefy_even_depth(infant, sample.size=4250)

#transforming abundance values to relative abundance
physeq_rar_RA <- transform_sample_counts(physeq_rar, calculate_RA)

#unweighted unifrac analysis and resulting PCoA plot
unifrac <- ordinate(physeq_rar_RA, method="PCoA", distance="unifrac")
plot_ordination(physeq_rar_RA, unifrac, type="sample", color="baby_delivery") +
  scale_color_manual(values=c("blue","red")) +
  stat_ellipse(type="norm", size=1) +
  theme_classic() +
  theme(legend.position=c(0.87, 0.82), legend.box.background = element_rect(size=1.2)) +
  labs(color="Mode of Birth") +
  theme(text=element_text(size=17))

#weighted unifrac and resulting PCoA
wunifrac <- ordinate(physeq_rar_RA, method="PCoA", distance="wunifrac")
plot_ordination(physeq_rar_RA, wunifrac, type="sample", color="baby_delivery") +
  scale_color_manual(values=c("blue","red"))+
  stat_ellipse(type="norm", size=1) +
  theme_classic() +
  theme(legend.position=c(0.87, 0.15), legend.box.background = element_rect(size=1.2)) +
  labs(color="Mode of Birth") +
  theme(text=element_text(size=17))

#jaccard and resulting PCoA
jaccard <- ordinate(physeq_rar_RA, method="PCoA", distance="jaccard")
plot_ordination(physeq_rar_RA, jaccard, type="sample", color="baby_delivery") +
  scale_color_manual(values=c("blue","red"))+
  stat_ellipse(type="norm", size=1) +
  theme_classic() +
  theme(legend.position=c(0.9, 0.885), legend.box.background = element_rect(size=1.2)) +
  labs(color="Mode of Birth") +
  theme(text=element_text(size=17))

#Bray-Curtis and resulting PCoA
bray <- ordinate(physeq_rar_RA, method="PCoA", distance="bray")
plot_ordination(physeq_rar_RA, bray, type="sample", color="baby_delivery") +
  scale_color_manual(values=c("blue","red"))+
  stat_ellipse(type="norm", size=1) +
  theme_classic() +
  theme(legend.position=c(0.9, 0.9), legend.box.background = element_rect(size=1.2)) +
  labs(color="Mode of Birth") +
  theme(text=element_text(size=17))

#################################################################################
#this commented section done in QIIME2: ANCOM setup and running at the genus level between modes of delivery
#for supplemental figure

## Collapse filtered feature table to genus level
# qiime taxa collapse \
# --i-table feature_frequency_infant_filtered_table.qza \
# --i-taxonomy taxonomy.qza \
# --p-level 6 \
# --o-collapsed-table feature_frequency_infant_filtered_table_l6.qza
# 
## Create composition artifact for differential abundance analysis
# qiime composition add-pseudocount \
# --i-table feature_frequency_infant_filtered_table_l6.qza \
# --o-composition-table comp_frequency_infant_table_l6.qza
# 
## ANCOM on baby delivery column
# qiime composition ancom \
# --i-table comp_frequency_infant_table_l6.qza \
# --m-metadata-file infant_metadata.txt \
# --m-metadata-column baby_delivery \
# --o-visualization l6_ancom_delivery.qzv

##################################################################################
#this section to plot the ANCOM results in R
#L6_ancom_delivery.qzv file from above section exported as ANCOM.tsv in view.qiime2.org

#importing ANCOM.tsv
ancom <- read.table(file="ANCOM.tsv", sep='\t', header=TRUE)

#visualize ancom as a scatterplot
ggplot(ancom,aes(y=W, x=clr)) +
  geom_point() +
  theme_classic() +
  labs(x="Centered Log Ratio Transformation",y="W statistic") + 
  xlim(0,12) +
  ylim(0, 210) +
  theme(panel.border=element_rect(color="black", fill=NA)) +
  annotate("text", x=10.3, y=185,label="Bacteroides", size=5) +
  theme(text=element_text(size=17))

#################################################################################
#this section for Bacteroides Relative Abundance Analysis
#subsetting to genera
Genus_RA <- tax_glom(physeq_rar_RA, taxrank = "Genus", NArm = FALSE)

#subsetting to Bacteroides
Bacteroides <- subset_taxa(Genus_RA, Genus == "g__Bacteroides")

#rearranging object so it can be plotted
Bacteroides_melt <- psmelt(Bacteroides)

#plotting relative abundance results
ggplot(Bacteroides_melt, aes(x = baby_delivery, y = Abundance)) + 
  geom_boxplot(aes(fill = baby_delivery)) +
  theme_classic() +
  scale_fill_manual(values=c("blue", "red")) +
  labs(x="Mode of Delivery", y="Relative Abundance", fill="Mode of Delivery", colour="Mode of Delivery") +
  theme(text=element_text(size=17)) +
  theme(legend.position="none")





<Script3.r>
#Correlational Studies

library(ggplot2)
library(tidyverse)
library(EnvStats)
library(ggsignif)
library(broom)
library(ggpubr)

infant_metadata <- readxl::read_xlsx("infant_metadata.xlsx", 
                                     col_names = TRUE)

#Filter metadata down to needed columns
infant_metadata_filter <- infant_metadata 
infant_metadata_filter <- select(infant_metadata_filter, 
                                 c("#SampleID", "life_stage", "age_category", 
                                   "anonymized_name", "baby_delivery", "host_weight", 
                                   "infant_waz",	"infant_waz_comp_to_median",	"infant_wlz",
                                   "infant_wlz_comp_to_median", "mom_bmi", "wlz_2m", 
                                   "wlz_2w", "wlz_4m", "wlz_6m", "wlz_9m", "wlz_12m",
                                   "wlz_avg_bin", "satiety_responsive",	"satiety_responsive_2_months",
                                   "satiety_responsive_2_weeks",	"satiety_responsive_4_months",
                                   "satiety_responsive_6_months",	"satiety_responsive_comp_to_median",
                                   "feed", "sex"))

# BMI reference: https://www.nih.gov/news-events/news-releases/nih-study-identifies-ideal-body-mass-index#:~:text=Current%20guidelines%20from%20the%20U.S.,as%20BMI%2035%20or%20higher.

#Sort data based on BMI
#Set variable
x <- infant_metadata_filter$mom_bmi
#Mutate to make a new column: Weight_Category
infant_metadata_filter_cat_BMI <- mutate(infant_metadata_filter, 
                                         Mother_Weight_Category = ifelse((x == "not applicable"), "NA", 
                                                                         ifelse((x < 18.5), "underweight", 
                                                                                ifelse((x >= 18.5 & x <24.9), "normal", 
                                                                                       ifelse((x >= 24.9 & x <= 29.9), "overweight", 
                                                                                              ifelse((x >= 30), "obese", "NA"))))))

#Subdivide data based on Mother's Weight_Category --> result: 4 datasets based on mother's weight
#Note: 0 underweight mothers: 36 normal, 42 obese, 35 overweight
#Note: ONLY CONTAINS MOTHERS
underweight_mom_only <- filter(infant_metadata_filter_cat_BMI, 
                               Mother_Weight_Category == "underweight")
normal_mom_only <- filter(infant_metadata_filter_cat_BMI, 
                          Mother_Weight_Category == "normal")
overweight_mom_only <- filter(infant_metadata_filter_cat_BMI, 
                              Mother_Weight_Category == "overweight")
obese_mom_only <- filter(infant_metadata_filter_cat_BMI, 
                         Mother_Weight_Category == "obese")



# note: the data above is filtered down to MOTHERS ONLY. 
# take the anonymized_name (which is the same between mother and infant)
# then apply it to the infant_metadata_filter_cat_BMI to keep both baby and mom

normal_mom_AND_infant <- filter(infant_metadata_filter_cat_BMI, 
                                anonymized_name %in% normal_mom_only$anonymized_name)

overweight_mom_AND_infant <- filter(infant_metadata_filter_cat_BMI, 
                                    anonymized_name %in% overweight_mom_only$anonymized_name)

obese_mom_AND_infant <- filter(infant_metadata_filter_cat_BMI, 
                               anonymized_name %in% obese_mom_only$anonymized_name)


#Generate infant ONLY data + add mom weight description
normal_mom_infant_only <- subset(normal_mom_AND_infant, life_stage == "Infant")
normal_mom_infant_only <- mutate(normal_mom_infant_only, 
                                 mom_weight_cat = ifelse(normal_mom_infant_only$Mother_Weight_Category == "NA",
                                                         "normal", "failed"))
overweight_mom_infant_only <- subset(overweight_mom_AND_infant, life_stage == "Infant")
overweight_mom_infant_only <- mutate(overweight_mom_infant_only, 
                                     mom_weight_cat = ifelse(overweight_mom_infant_only$Mother_Weight_Category == "NA",
                                                             "overweight", "failed"))
obese_mom_infant_only <- subset(obese_mom_AND_infant, life_stage == "Infant")
obese_mom_infant_only <- mutate(obese_mom_infant_only, 
                                mom_weight_cat = ifelse(obese_mom_infant_only$Mother_Weight_Category == "NA",
                                                        "obese", "failed"))

infant_only_norm_over <- merge(normal_mom_infant_only, 
                               overweight_mom_infant_only, 
                               all = TRUE)
infant_only_combined <- merge(infant_only_norm_over, 
                              obese_mom_infant_only, 
                              all = TRUE)

##################################
satiety_2weeks <- data.frame("anonymized_name" = infant_only_combined$anonymized_name, 
                             "feed" = infant_only_combined$feed, 
                             "satiety" = infant_only_combined$satiety_responsive_2_weeks, 
                             "mom_bmi" = infant_only_combined$mom_weight_cat,
                             "baby_delivery" = infant_only_combined$baby_delivery,
                             "sex" = infant_only_combined$sex, 
                             "time" = "2 weeks")


satiety_2months <- data.frame("anonymized_name" = infant_only_combined$anonymized_name, 
                             "feed" = infant_only_combined$feed, 
                             "satiety" = infant_only_combined$satiety_responsive_2_months, 
                             "mom_bmi" = infant_only_combined$mom_weight_cat, 
                             "baby_delivery" = infant_only_combined$baby_delivery,
                             "sex" = infant_only_combined$sex,
                             "time" = "2 months")

satiety_4months <- data.frame("anonymized_name" = infant_only_combined$anonymized_name, 
                              "feed" = infant_only_combined$feed, 
                              "satiety" = infant_only_combined$satiety_responsive_4_months, 
                              "mom_bmi" = infant_only_combined$mom_weight_cat, 
                              "baby_delivery" = infant_only_combined$baby_delivery,
                              "sex" = infant_only_combined$sex,
                              "time" = "4 months")

satiety_6months <- data.frame("anonymized_name" = infant_only_combined$anonymized_name, 
                              "feed" = infant_only_combined$feed, 
                              "satiety" = infant_only_combined$satiety_responsive_6_months, 
                              "mom_bmi" = infant_only_combined$mom_weight_cat, 
                              "baby_delivery" = infant_only_combined$baby_delivery,
                              "sex" = infant_only_combined$sex,
                              "time" = "6 months")

satiety_combined <- rbind(satiety_2weeks, satiety_2months, satiety_4months, satiety_6months)
satiety_combined[3] <- lapply(satiety_combined[3], as.numeric) 
satiety_combined_vaginal <- filter(satiety_combined, 
                                   baby_delivery == "vaginal") 


#Not filtered by vaginal birth 
ggplot(satiety_combined, 
       aes(y = satiety, x = mom_bmi)) + 
  facet_wrap(~time) +
  stat_n_text() +
  theme_bw() +
  geom_boxplot() +
  labs(y = "Infant Satiety Responsiveness", 
       x = "Maternal Weight Class", 
       title = "ALL Infant Satiety responsiveness over time")

satiety_combined_vaginal$time <- factor(satiety_combined_vaginal$time, levels=c("2 weeks", "2 months", "4 months", "6 months"))
satiety_combined_vaginal$mom_bmi <- factor(satiety_combined_vaginal$mom_bmi, levels=c("normal", "overweight", "obese"))

#filtered by vaginal birth (Figure 3B)
ggplot(satiety_combined_vaginal,
       aes(y = satiety, x = mom_bmi)) +
  facet_wrap( ~ time) +
  stat_n_text(y.pos = 0) +
  theme_bw() +
  theme(text=element_text(size=17)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 5)) +
  labs(y = "Infant Satiety Responsiveness",
       x = "Maternal Weight Class",
       title = "Vaginally delivered Infant Satiety responsiveness over time")

####### WLZ
wlz_2weeks <- data.frame("anonymized_name" = infant_only_combined$anonymized_name, 
                         "feed" = infant_only_combined$feed, 
                         "wlz" = infant_only_combined$wlz_2w, 
                         "mom_bmi" = infant_only_combined$mom_weight_cat,
                         "baby_delivery" = infant_only_combined$baby_delivery, 
                         "sex" = infant_only_combined$sex,
                         "time" = "2 weeks")


wlz_2months <- data.frame("anonymized_name" = infant_only_combined$anonymized_name, 
                              "feed" = infant_only_combined$feed, 
                              "wlz" = infant_only_combined$wlz_2m, 
                              "mom_bmi" = infant_only_combined$mom_weight_cat,
                          "baby_delivery" = infant_only_combined$baby_delivery,
                          "sex" = infant_only_combined$sex,
                              "time" = "2 months")

wlz_4months <- data.frame("anonymized_name" = infant_only_combined$anonymized_name, 
                              "feed" = infant_only_combined$feed, 
                              "wlz" = infant_only_combined$wlz_4m, 
                              "mom_bmi" = infant_only_combined$mom_weight_cat, 
                          "baby_delivery" = infant_only_combined$baby_delivery,
                          "sex" = infant_only_combined$sex,
                              "time" = "4 months")

wlz_6months <- data.frame("anonymized_name" = infant_only_combined$anonymized_name, 
                              "feed" = infant_only_combined$feed, 
                              "wlz" = infant_only_combined$wlz_6m, 
                              "mom_bmi" = infant_only_combined$mom_weight_cat,
                          "baby_delivery" = infant_only_combined$baby_delivery,
                          "sex" = infant_only_combined$sex,
                              "time" = "6 months")

wlz_12months <- data.frame("anonymized_name" = infant_only_combined$anonymized_name, 
                          "feed" = infant_only_combined$feed, 
                          "wlz" = infant_only_combined$wlz_12m, 
                          "mom_bmi" = infant_only_combined$mom_weight_cat, 
                          "baby_delivery" = infant_only_combined$baby_delivery,
                          "sex" = infant_only_combined$sex,
                          "time" = "12 months")

wlz_combined <- rbind(wlz_2weeks, wlz_2months, wlz_4months, wlz_6months, wlz_12months)
wlz_combined[3] <- lapply(wlz_combined[3], as.numeric)
wlz_combined_vaginal <- filter(wlz_combined, 
                               baby_delivery == "vaginal")

wlz_combined$time <- factor(wlz_combined$time, levels=c("2 weeks", "2 months", "4 months", "6 months", "12 months"))
wlz_combined$mom_bmi <- factor(wlz_combined$mom_bmi, levels=c("normal", "overweight", "obese"))


wlz_combined_vaginal$time <- factor(wlz_combined_vaginal$time, levels=c("2 weeks", "2 months", "4 months", "6 months", "12 months"))
wlz_combined_vaginal$mom_bmi <- factor(wlz_combined_vaginal$mom_bmi, levels=c("normal", "overweight", "obese"))

#Figure plot (3A)
ggplot(wlz_combined_vaginal,
       aes(y = wlz, x = mom_bmi)) +
  facet_wrap(~ time) +
  stat_n_text() +
  theme_bw() +
  theme(text=element_text(size=17)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(-5, 4)) +
  labs(y = "Infant Weight-for-length Z score",
       x = "Maternal Weight Class",
       title = "Vaginally delivered infant weight-for-length Z score by maternal BMI")

#######

#ANOVA stats
#Anova tests: 

#2 weeks WLZ
wlz_2weeks[3] <- lapply(wlz_2weeks[3], as.numeric)
wlz_2weeks_filter <- filter(wlz_2weeks, 
                     baby_delivery == "vaginal")

wlz_2weeks_anova <- lm(wlz ~ mom_bmi, data = wlz_2weeks_filter)
anova(wlz_2weeks_anova)
wlz_2weeks_anova_tukey <- TukeyHSD(aov(wlz_2weeks_anova))
tidy(wlz_2weeks_anova_tukey)

#2 months WLZ
wlz_2months[3] <- lapply(wlz_2months[3], as.numeric)
wlz_2months_filter <- filter(wlz_2months, 
                            baby_delivery == "vaginal")

wlz_2months_anova <- lm(wlz ~ mom_bmi, data = wlz_2months_filter)
anova(wlz_2months_anova)
wlz_2months_anova_tukey <- TukeyHSD(aov(wlz_2months_anova))
tidy(wlz_2months_anova_tukey)


#4 months WLZ
wlz_4months[3] <- lapply(wlz_4months[3], as.numeric)
wlz_4months_filter <- filter(wlz_4months, 
                             baby_delivery == "vaginal")

wlz_4months_anova <- lm(wlz ~ mom_bmi, data = wlz_4months_filter)
anova(wlz_4months_anova)
wlz_4months_anova_tukey <- TukeyHSD(aov(wlz_4months_anova))
tidy(wlz_4months_anova_tukey)


#6 months WLZ
wlz_6months[3] <- lapply(wlz_6months[3], as.numeric)
wlz_6months_filter <- filter(wlz_6months, 
                             baby_delivery == "vaginal")

wlz_6months_anova <- lm(wlz ~ mom_bmi, data = wlz_6months_filter)
anova(wlz_6months_anova)
wlz_6months_anova_tukey <- TukeyHSD(aov(wlz_6months_anova))
tidy(wlz_6months_anova_tukey)

#12 months WLZ
wlz_12months[3] <- lapply(wlz_12months[3], as.numeric)
wlz_12months_filter <- filter(wlz_12months, 
                             baby_delivery == "vaginal")

wlz_12months_anova <- lm(wlz ~ mom_bmi, data = wlz_12months_filter)
anova(wlz_12months_anova)
wlz_12months_anova_tukey <- TukeyHSD(aov(wlz_12months_anova))
tidy(wlz_12months_anova_tukey)


    
#####################
#ANOVA satiety 

#Satiety 2 weeks
satiety_2weeks[3] <- lapply(satiety_2weeks[3], as.numeric)
satiety_2weeks_filter <- filter(satiety_2weeks, 
                              baby_delivery == "vaginal")

satiety_2weeks_anova <- lm(satiety ~ mom_bmi, data = satiety_2weeks_filter)
anova(satiety_2weeks_anova)
satiety_2weeks_anova_tukey <- TukeyHSD(aov(satiety_2weeks_anova))
tidy(satiety_2weeks_anova_tukey)

#2 months
satiety_2months[3] <- lapply(satiety_2months[3], as.numeric)
satiety_2months_filter <- filter(satiety_2months, 
                                baby_delivery == "vaginal")

satiety_2months_anova <- lm(satiety ~ mom_bmi, data = satiety_2months_filter)
anova(satiety_2months_anova)
satiety_2months_anova_tukey <- TukeyHSD(aov(satiety_2months_anova))
tidy(satiety_2months_anova_tukey)


#satiety 4 months
satiety_4months[3] <- lapply(satiety_4months[3], as.numeric)
satiety_4months_filter <- filter(satiety_4months, 
                                 baby_delivery == "vaginal")

satiety_4months_anova <- lm(satiety ~ mom_bmi, data = satiety_4months_filter)
anova(satiety_4months_anova)
satiety_4months_anova_tukey <- TukeyHSD(aov(satiety_4months_anova))
tidy(satiety_4months_anova_tukey)


#satiety 6 months
satiety_6months[3] <- lapply(satiety_6months[3], as.numeric)
satiety_6months_filter <- filter(satiety_6months, 
                                 baby_delivery == "vaginal")

satiety_6months_anova <- lm(satiety ~ mom_bmi, data = satiety_6months_filter)
anova(satiety_6months_anova)
satiety_6months_anova_tukey <- TukeyHSD(aov(satiety_6months_anova))
tidy(satiety_6months_anova_tukey)

##################################################################################################################
DATA WRANGLING FOR BETA DIVERSITY AND DAA
#################################################################################################################

#this script filters infant_metadata.xlsx 
#this script conducts correlation analyses between infant Weight_for_length 
#and satiety responsiveness vs Maternal Weight Category 

library(tidyverse)
library(dplyr)
library(Hmisc)

##################################################################################################################################################
#edit metadata to include maternal bmi category 
infant_metadata <- readxl::read_xlsx("infant_metadata.xlsx", 
                                     col_names = TRUE)

test <- select(infant_metadata_cat_BMI, 
               "life_stage", "mom_bmi", "anonymized_name", "age_category")

x <- infant_metadata$mom_bmi

#Mutate to make a new column: Weight_Category
infant_metadata_cat_BMI <- mutate(infant_metadata, 
                                  Mother_Weight_Category = ifelse((x == "not applicable"), "NA", 
                                                                  ifelse((x < 18.5), "underweight", 
                                                                         ifelse((x >= 18.5 & x <24.9), "normal", 
                                                                                ifelse((x >= 24.9 & x <= 29.9), "overweight", 
                                                                                       ifelse((x >= 30), "obese", "NA"))))))




underweight_mom_only <- filter(infant_metadata_cat_BMI, 
                               Mother_Weight_Category == "underweight")
normal_mom_only <- filter(infant_metadata_cat_BMI, 
                          Mother_Weight_Category == "normal")
overweight_mom_only <- filter(infant_metadata_cat_BMI, 
                              Mother_Weight_Category == "overweight")
obese_mom_only <- filter(infant_metadata_cat_BMI, 
                         Mother_Weight_Category == "obese")

#######################################
#Design a system: Going from normal --> overweight --> obese
#when anonymized name is used once, it's removed for future uses
#######################################

used_list <- infant_metadata$anonymized_name
used_list <- as.data.frame(used_list)

#filter normal mothers and infants
#take infants that have IDs the same as Normal mothers
normal_mom_AND_infant <- filter(infant_metadata_cat_BMI, 
                                anonymized_name %in% normal_mom_only$anonymized_name)
#a few mom_bmi for the same anonymized ID have bmi taken at DIFFERENT TIME PONITS
#this results in 1 anonymized name having MULTIPLE BMI definitions (e.g. 0.5 months = obese, 4 months = overweight)
#here, we filter based on Mother_weight_category being normal or NA
#the same anonymized name will be categorized differently
normal_mom_AND_infant <- filter(normal_mom_AND_infant, Mother_Weight_Category %in% c("normal", "NA"))
check_normal1 <- select(normal_mom_AND_infant, "#SampleID", "mom_bmi", "Mother_Weight_Category", "anonymized_name", "life_stage")

#Keep items that are NOT IN normal_mom_AND_infant$anonymized_name
used_list <- filter(used_list, used_list %nin% normal_mom_AND_infant$anonymized_name)

#same with overweight
#this creates a new dataframe that includes only anonymized names that haven't been used
overweight_filtered <- filter(overweight_mom_only, 
                              anonymized_name %in% used_list$used_list)
overweight_mom_AND_infant <- filter(infant_metadata_cat_BMI, 
                                    anonymized_name %in% overweight_filtered$anonymized_name)
overweight_mom_AND_infant <- filter(overweight_mom_AND_infant, Mother_Weight_Category %in% c("overweight", "NA"))
check_overweight1 <- select(overweight_mom_AND_infant, "#SampleID", "mom_bmi", "Mother_Weight_Category", "anonymized_name", "life_stage")

#Keep items that are NOT IN normal_mom_AND_infant$anonymized_name AND overweght_mom_AND_infant$anonymized name
used_list <- filter(used_list, used_list %nin% overweight_mom_AND_infant$anonymized_name)

#used_list should contain only remaining ones for obese

#same with obese
obese_filtered <- filter(obese_mom_only, 
                         anonymized_name %in% used_list$used_list)
obese_mom_AND_infant <- filter(infant_metadata_cat_BMI, 
                               anonymized_name %in% obese_filtered$anonymized_name)
obese_mom_AND_infant <- filter(obese_mom_AND_infant, Mother_Weight_Category %in% c("overweight", "NA"))
check_obese1 <- select(obese_mom_AND_infant, "#SampleID", "mom_bmi", "Mother_Weight_Category", "anonymized_name", "life_stage")

#Generate infant ONLY data + add mom weight description
normal_mom_infant_only <- subset(normal_mom_AND_infant, life_stage == "Infant")
normal_mom_infant_only <- mutate(normal_mom_infant_only, 
                                 mom_weight_cat = ifelse(normal_mom_infant_only$Mother_Weight_Category == "NA",
                                                         "normal", "failed"))
check_normal <- select(normal_mom_infant_only, 
                       "#SampleID", "mom_bmi", "mom_weight_cat")

overweight_mom_infant_only <- subset(overweight_mom_AND_infant, life_stage == "Infant")
overweight_mom_infant_only <- mutate(overweight_mom_infant_only, 
                                     mom_weight_cat = ifelse(overweight_mom_infant_only$Mother_Weight_Category == "NA",
                                                             "overweight", "failed"))
obese_mom_infant_only <- subset(obese_mom_AND_infant, life_stage == "Infant")
obese_mom_infant_only <- mutate(obese_mom_infant_only, 
                                mom_weight_cat = ifelse(obese_mom_infant_only$Mother_Weight_Category == "NA",
                                                        "obese", "failed"))

infant_only_norm_over <- merge(normal_mom_infant_only, 
                               overweight_mom_infant_only, 
                               all = TRUE)
infant_only_combined <- merge(infant_only_norm_over, 
                              obese_mom_infant_only, 
                              all = TRUE)

total_data <- merge(infant_only_combined, infant_metadata_cat_BMI, all = TRUE)

#infant_metadata_cat_BMI is now has mothers classified as obese, overweight, normal, underweight

#Save file
write.table(total_data,
            file = "qiime_files/infant_metadata_infant_only_combined_with_mat_weight_cat.tsv", 
            row.names=TRUE, 
            sep="\t")

##################################################################################################################
BETA DIVERSITY/PCOA PLOTS
##################################################################################################################
#Differential abundance analysis 
#Goal: see if there are different microbes in obese infants vs overweight vs normal **
#this script conducts beta divresity analyses (Wunifrac) and plots PCoA plots
#for mothers and vaginally delivered infants.

library(tidyverse) 
library(vegan) 
library(phyloseq) 
library(DESeq2) 

#create gm_mean functions
gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[ x> 0]), na.rm = na.rm) / length(x))
}

infant_mat_bmi <- read.table("qiime_files/infant_metadata_infant_only_combined_with_mat_weight_cat.tsv",
                             header = TRUE, 
                             sep = "\t")

#load file into R, load into an object, and then manipulate 
biom_file <- import_biom("qiime_files/table-with-taxonomy.biom")
metadata <- import_qiime_sample_data("qiime_files/infant_metadata_infant_only_combined_with_mat_weight_cat.tsv")
tree <- read_tree_greengenes("qiime_files/tree.nwk")

#create phylseq object
infant_physeq <- merge_phyloseq(biom_file,
                                metadata, 
                                tree)

set.seed(1124) 

#taxonomic rank names 
rank_names(infant_physeq)

#get the column names of the tax table, and then rename them
colnames(tax_table(infant_physeq)) <- c("Kingdom", "Phylum", "Class",
                                 "Order","Family", "Genus", "Species")

#change the rank names. the names of the ranks are in the taxonomy table of the physeq object
rank_names(infant_physeq)


#beta diversity PCoA plot
#diversity requres rarefied taxa table
#we know from qiime that we should rarefy to 4250 
physeq_rar <- rarefy_even_depth(infant_physeq, sample.size = 4250)

#convert to RA (relative abundance)
physeq_rar_RA <- transform_sample_counts(physeq_rar, 
                                         function(x) x/sum(x))
#function(x) x/sum(x) means given x, divide x by the sum of all x's 
#we define the type of analysis with ordinate() and setting the 
#method = PCoA, setting distance to type of beta diversity analysis 
#you can change analysis to jaccard, bray-curtis and so on

#unfiltered data
ord <- ordinate(physeq = physeq_rar_RA, 
                method = "PCoA", 
                distance = "wunifrac")
plot_ordination(physeq_rar_RA, 
                ord, 
                type = "sample", 
                color = "mom_weight_cat", 
                title = "PCoA Unfiltered (weighted unifrac)") 

#Filter based on metadata: mothers only 
mother_only_physeq <- subset_samples(infant_physeq, life_stage == "Adult")
physeq_rar_mother_only <- rarefy_even_depth(mother_only_physeq, sample.size = 4250)
physeq_rar_mother_onlyRA <- transform_sample_counts(physeq_rar_mother_only, 
                                                    function(x) x/sum(x))
mother_only_ord <- ordinate(physeq = physeq_rar_mother_onlyRA, 
                            method = "PCoA", 
                            distance = "wunifrac")
#Figure 4A
plot_ordination(physeq_rar_mother_onlyRA, 
                mother_only_ord, 
                type = "sample", 
                color = "Mother_Weight_Category", 
                title = "PCoA Mothers Only (Weighted Unifrac)") +
  theme_bw() +
  theme(text=element_text(size=17)) 


#filter based on metadata: infants only
infant_only_physeq <- subset_samples(infant_physeq, life_stage == "Infant")
infant_only_physeq <- subset_samples(infant_physeq, mom_weight_cat %in% c("obese", "overweight", "normal"))
physeq_rar_infant_only <- rarefy_even_depth(infant_only_physeq, sample.size = 4250)
physeq_rar_infant_onlyRA <- transform_sample_counts(physeq_rar_infant_only, 
                                                    function(x) x/sum(x))

infant_only_ord <- ordinate(physeq = physeq_rar_infant_onlyRA, 
                            method = "PCoA", 
                            distance = "wunifrac")

plot_ordination(physeq_rar_infant_onlyRA, 
                infant_only_ord, 
                type = "sample", 
                color = "mom_weight_cat", 
                title = "PCoA Infants Only (Weighted Unifrac)",
                size) 


#Vaginal infants only
infant_only_physeq_vaginal <- subset_samples(infant_only_physeq, baby_delivery == "vaginal")
physeq_rar_infant_only_vaginal <- rarefy_even_depth(infant_only_physeq_vaginal, sample.size = 4250)
physeq_rar_infant_only_vaginal_RA <- transform_sample_counts(physeq_rar_infant_only_vaginal, 
                                                    function(x) x/sum(x))

vaginal_infant_ord <- ordinate(physeq = physeq_rar_infant_only_vaginal_RA, 
                               method = "PCoA", 
                               distance = "wunifrac")
#Figure 5A
plot_ordination(physeq_rar_infant_only_vaginal_RA, 
                vaginal_infant_ord, 
                type = "sample", 
                color = "mom_weight_cat", 
                title = "PCoA Vaginal Infants only (Weighted Unifrac)") + 
  theme_bw() + 
  theme(text=element_text(size=17)) 

##################################################################################################################
DIFFERENTIAL ABUNDANCE/RELATIVE ABUNDANCE
##################################################################################################################
#This script conducts differential abundance and
#relative abundance analyses for mothers and vaginally delivered infants
  
library(tidyverse)
library(vegan)
library(phyloseq)
library(DESeq2)
library(EnvStats)
library(broom)


################################################################################
#Relative abundance 
calculate_RA <- function(x) x/sum(x)

gm_mean <- function(x, na.rm = TRUE) {
  exp(sum(log(x[ x> 0]), na.rm = na.rm) / length(x))
}


infant_mat_bmi <- read.table("qiime_files/infant_metadata_infant_only_combined_with_mat_weight_cat.tsv",
                             header = TRUE, 
                             sep = "\t")

#load file into R, load into an object, and then manipulate 
#import_biom is  specialized function that helps us import biom files into R
biom_file <- import_biom("qiime_files/table-with-taxonomy.biom")
metadata <- import_qiime_sample_data("qiime_files/infant_metadata_infant_only_combined_with_mat_weight_cat.tsv")
tree <- read_tree_greengenes("qiime_files/tree.nwk")


infant_physeq <- merge_phyloseq(biom_file,
                                metadata, 
                                tree)
# Convert taxonomic rank from numbers to proper names
colnames(tax_table(infant_physeq)) <- c("Kingdom", "Phylum", "Class", "Order",
                                 "Family", "Genus", "Species")


# Keep only abundant ASVs
# First determine counts of ASV across all samples
total_counts <- taxa_sums(infant_physeq)


#Calculate relative abundance for each ASV
relative_abundance <- calculate_RA(total_counts)

# Determine which ASVs are more abundant than X cutoff
# Change this if you want a different cutoff (0.0001 = 0.01%)
abundant <- relative_abundance > 0.0001
abundant_taxa <- prune_taxa(abundant, infant_physeq)

family <- tax_glom(abundant_taxa, taxrank = "Family", NArm = FALSE)

#Keep infant samples only  
infant_family <- subset_samples(family, life_stage == "Infant")
infant_family_vaginal <- subset_samples(infant_family, baby_delivery == "vaginal")
infant_family_vaginal_narm <- subset_samples(infant_family_vaginal, mom_weight_cat != "NA")
infant_family_vaginal_overweight_norm <- subset_samples(infant_family_vaginal_narm, mom_weight_cat %in% c("normal", "overweight"))
infant_family_vaginal_obese_norm <- subset_samples(infant_family_vaginal_narm, mom_weight_cat %in% c("normal", "obese"))
infant_family_vaginal_obese_overweight <- subset_samples(infant_family_vaginal_narm, mom_weight_cat %in% c("obese", "overweight"))

deseq_feature <- phyloseq_to_deseq2(infant_family_vaginal_narm, ~ mom_weight_cat)

geo_means <- apply(counts(deseq_feature), 1, gm_mean)
deseq_feature <- estimateSizeFactors(deseq_feature, geoMeans = geo_means)
deseq_feature <- DESeq(deseq_feature, fitType="local")

feature_diff_abund <- results(deseq_feature)
# Define cutoff for the adjusted p-value
alpha <- 0.05
# Reformat information as data frame including feature as variable
significant_feature <- as_tibble(feature_diff_abund, rownames = "feature")
# Keep only significant results and sort by adjusted p-value
significant_feature <- filter(significant_feature, padj < alpha)
significant_feature <- arrange(significant_feature, padj)

# Get the taxonomic information as a data frame
taxa_df <- as_tibble(as.data.frame(tax_table(infant_physeq)), rownames = "feature")

# Combine the significant features with taxonomic classification
significant_feature <- inner_join(significant_feature, taxa_df)

dim(significant_feature)

# Plot differential abundance
significant_feature %>%
  ggplot(aes(x = log2FoldChange, y = Family)) +
  geom_col() + 
  theme_bw() + 
  labs(x = "log2FoldChange", 
       y = "Family", 
       title = "Vaginally Delivered Infants")

physeq_RA_infant <- transform_sample_counts(infant_physeq, calculate_RA)
family_RA_infant <- tax_glom(physeq_RA_infant, taxrank = "Family", NArm = FALSE)

# graph erysipelotrichaceae  and filter for that family with subset_taxa.
erysipelotrichaceae <- subset_samples(family_RA_infant, life_stage == "Infant")
erysipelotrichaceae <- subset_samples(erysipelotrichaceae, baby_delivery == "vaginal")
erysipelotrichaceae <- subset_samples(erysipelotrichaceae, mom_weight_cat != "NA")
erysipelotrichaceae <- subset_taxa(erysipelotrichaceae, Family == "f__Erysipelotrichaceae")

# Change the table from wide to long format.
erysipelotrichaceae_melt <- psmelt(erysipelotrichaceae)

#Figure 5B
ggplot(erysipelotrichaceae_melt, aes(x = as.factor(mom_weight_cat), y = Abundance)) + 
  # Visualize data as Boxplot
  geom_boxplot(aes(fill = as.factor(mom_weight_cat))) + 
  # Label for x-axis
  xlab("Mother Weight Class") +
  # Label for y-axis
  ylab("Relative Abundance") +
  # Plot title
  ggtitle("Erysipelotrichaceae") + 
  # colors for the different groups (here subject 1 and 2)
  scale_fill_manual(values=c("seagreen3", "indianred1", "black")) +
  guides(fill = FALSE) +
  # Set a standard theme for plot
  theme_bw(base_size = 16) +
  stat_n_text() 


# graph xanthomonadaceae  and filter for that family with subset_taxa.
xanthomonadaceae <- subset_samples(family_RA_infant, life_stage == "Infant")
xanthomonadaceae <- subset_samples(xanthomonadaceae, baby_delivery == "vaginal")
xanthomonadaceae <- subset_samples(xanthomonadaceae, mom_weight_cat != "NA")
xanthomonadaceae <- subset_taxa(xanthomonadaceae, Family == "f__Xanthomonadaceae")

# change the table from wide to long format.
xanthomonadaceae_melt <- psmelt(xanthomonadaceae)

#Figure 5C
ggplot(xanthomonadaceae_melt, aes(x = as.factor(mom_weight_cat), y = Abundance)) + 
  # Visualize data as Boxplot
  geom_boxplot(aes(fill = as.factor(mom_weight_cat))) + 
  # Label for x-axis
  xlab("Mother Weight Class") +
  # Label for y-axis
  ylab("Relative Abundance") +
  # Plot title
  ggtitle("Xanthomonadaceae") + 
  # colors for the different groups (here subject 1 and 2)
  scale_fill_manual(values=c("seagreen3", "indianred1", "black")) +
  guides(fill = FALSE) +
  # Set a standard theme for plot
  theme_bw(base_size = 16) +
  stat_n_text() 


#########################
#mothers 

mothers_family <- subset_samples(family, life_stage == "Adult")
mothers_family_obese_normal <- subset_samples(mothers_family, Mother_Weight_Category %in% c("obese", "normal"))
mothers_family_overweight_normal <- subset_samples(mothers_family, Mother_Weight_Category %in% c("overweight", "normal"))
mothers_family_obese_overweight <- subset_samples(mothers_family, Mother_Weight_Category %in% c("obese", "overweight"))

deseq_feature_mother <- phyloseq_to_deseq2(mothers_family, ~ Mother_Weight_Category)

geo_means_mother <- apply(counts(deseq_feature_mother), 1, gm_mean)
deseq_feature_mother <- estimateSizeFactors(deseq_feature_mother, geoMeans = geo_means)
deseq_feature_mother <- DESeq(deseq_feature_mother, fitType="local")

feature_diff_abund_mother <- results(deseq_feature_mother)
alpha <- 0.05
# Reformat information as data frame including feature as variable
significant_feature_mother <- as_tibble(feature_diff_abund_mother, rownames = "feature")
# Keep only significant results and sort by adjusted p-value
significant_feature_mother <- filter(significant_feature_mother, padj < alpha)
significant_feature_mother <- arrange(significant_feature_mother, padj)

# Get the taxonomic information as a data frame
taxa_df_mother <- as_tibble(as.data.frame(tax_table(infant_physeq)), rownames = "feature")

# Combine the significant features with taxonomic classification
significant_feature_mother <- inner_join(significant_feature_mother, taxa_df)

# there 14 features that are different at FDR-corrected p < 0.05!
dim(significant_feature_mother)

# Plot differential abundance
significant_feature_mother %>%
  ggplot(aes(x = log2FoldChange, y = Family)) +
  geom_col() + 
  theme_bw() + 
  labs(x = "log2FoldChange", 
       y = "Family", 
       title = "Differential Abundance in Mothers")

################ Relative Abundance
physeq_RA_mother <- transform_sample_counts(infant_physeq, calculate_RA)
family_RA_mother <- tax_glom(physeq_RA_mother, taxrank = "Family", NArm = FALSE)

# graph paraprevotellaceae  and filter for that family with subset_taxa.
paraprevotellaceae <- subset_samples(family_RA_mother, life_stage == "Adult")
paraprevotellaceae <- subset_samples(paraprevotellaceae, Mother_Weight_Category != "NA")
paraprevotellaceae <- subset_taxa(paraprevotellaceae, Family == "f__[Paraprevotellaceae]")

# plot the data by changing the table from wide to long format.
paraprevotellaceae_melt <- psmelt(paraprevotellaceae)


#Figure 4B
ggplot(paraprevotellaceae_melt, aes(x = as.factor(Mother_Weight_Category), y = Abundance)) + 
  # Visualize data as Boxplot
  geom_boxplot(aes(fill = as.factor(Mother_Weight_Category))) + 
  # Label for x-axis
  xlab("Mother Weight Category") +
  # Label for y-axis
  ylab("Relative Abundance") +
  # Plot title
  ggtitle("Paraprevotellaceae") + 
  # colors for the different groups (here subject 1 and 2)
  scale_fill_manual(values=c("seagreen3", "indianred1", "black")) +
  guides(fill = FALSE) +
  # Set a standard theme for plot
  theme_bw(base_size = 16) +
  stat_n_text() 





<Script4.sh>
# Calculate alpha-diversity group significance
qiime diversity alpha-group-significance \
 --i-alpha-diversity core_metrics_results_4250_depth/shannon_vector.qza \
 --m-metadata-file infant_metadata.txt \
 --o-visualization core_metrics_results_4250_depth/shannon_group_significance.qzv

qiime diversity alpha-group-significance \
 --i-alpha-diversity core_metrics_results_4250_depth/evenness_vector.qza \
 --m-metadata-file infant_metadata.txt \
 --o-visualization core_metrics_results_4250_depth/evenness_group_significance.qzv

qiime diversity alpha-group-significance \
 --i-alpha-diversity core_metrics_results_4250_depth/observed_features_vector.qza \
 --m-metadata-file infant_metadata.txt \
 --o-visualization core_metrics_results_4250_depth/observed_features_group_significance.qzv

qiime diversity alpha-group-significance \
 --i-alpha-diversity core_metrics_results_4250_depth/faith_pd_vector.qza \
 --m-metadata-file infant_metadata.txt \
 --o-visualization core_metrics_results_4250_depth/faith_pd_group_significance.qzv

# Calculate beta-diversity group significance
qiime diversity beta-group-significance \
 --i-distance-matrix core_metrics_results_4250_depth/bray_curtis_distance_matrix.qza \
 --m-metadata-file infant_metadata.txt --m-metadata-column feed \
 --o-visualization core_metrics_results_4250_depth/bray_curtis_feed_significance.qzv

qiime diversity beta-group-significance \
 --i-distance-matrix core_metrics_results_4250_depth/jaccard_distance_matrix.qza \
 --m-metadata-file infant_metadata.txt --m-metadata-column feed \
 --o-visualization core_metrics_results_4250_depth/jaccard_feed_significance.qzv

qiime diversity beta-group-significance \ 
 --i-distance-matrix core_metrics_results_4250_depth/weighted_unifrac_distance_matrix.qza \
 --m-metadata-file infant_metadata.txt \
 --m-metadata-column feed \
 --o-visualization core_metrics_results_4250_depth/weighted_unifrac_feed_significance.qzv

qiime diversity beta-group-significance \
 --i-distance-matrix core_metrics_results_4250_depth/unweighted_unifrac_distance_matrix.qza \
 --m-metadata-file infant_metadata.txt --m-metadata-column feed \
 --o-visualization core_metrics_results_4250_depth/unweighted_unifrac_feed_significance.qzv


# Generate PCoA plots
qiime emperor plot \
 --i-pcoa core_metrics_results_4250_depth/unweighted_unifrac_pcoa_results.qza \
 --m-metadata-file infant_metadata.txt \
 --o-visualization core_metrics_results_4250_depth/unweighted_unifrac_emperor

qiime emperor plot \
 --i-pcoa core_metrics_results_4250_depth/weighted_unifrac_pcoa_results.qza \
 --m-metadata-file infant_metadata.txt \
 --o-visualization core_metrics_results_4250_depth/weighted_unifrac_emperor

qiime emperor plot \
 --i-pcoa core_metrics_results_4250_depth/jaccard_pcoa_results.qza \
 --m-metadata-file infant_metadata.txt \
 --o-visualization core_metrics_results_4250_depth/jaccard_emperor

qiime emperor plot \
 --i-pcoa core_metrics_results_4250_depth/bray_curtis_pcoa_results.qza \
 --m-metadata-file infant_metadata.txt \
 --o-visualization core_metrics_results_4250_depth/bray_curtis_emperor


# ANCOM on QIIME
qiime metadata tabulate \
 --m-input-file taxonomy.qza \
 --o-visualization taxonomy.qzv
# Remove low-frequency samples
qiime feature-table filter-features \
 --i-table infant_only_table.qza \
 --p-min-frequency 10 \
 --o-filtered-table feature_frequency_infant_filtered_table.qza

# Generate taxonomic barplot
qiime taxa barplot
 --i-table feature_frequency_infant_filtered_table.qza
 --i-taxonomy taxonomy.qza
 --m-metadata-file infant_metadata.txt
 --o-visualization taxa-bar-plots.qzv

# Collapse filtered feature table to family level
qiime taxa collapse \
 --i-table feature_frequency_infant_filtered_table.qza \
 --i-taxonomy taxonomy.qza \
 --p-level 5 \
 --o-collapsed-table feature_frequency_infant_filtered_table_l5.qza

# Create composition artifact for differential abundance analysis
qiime composition add-pseudocount \
 --i-table feature_frequency_infant_filtered_table_l5.qza \
 --o-composition-table comp_frequency_infant_table_l5.qza

 # ANCOM on feed column
qiime composition ancom \
 --i-table comp_frequency_infant_table_l5.qza \
 --m-metadata-file infant_metadata.txt \
 --m-metadata-column feed \
 --o-visualization l5_ancom_feed.qzv





<Script5.r>
# l5_ancom_feed.qzv file exported as l5_ancom_feed.tsv in view.qiime2.org
# to add customizations
library(ggplot2)
library(EnvStats)
ancom <- read.table(file = "l5_ancom_feed.tsv", 
                    sep = '\t', 
                    header = TRUE)

# Plotting the ANCOM volcano plot
ggplot(ancom, aes(y = W, x = clr)) +
  geom_point() +
  theme_classic() +
  labs(x = "Centered Log Ratio (clr)", y = "W") +
  ylim(0, 100) +
  xlim(0,16)

# faith_pd_group_significance.qzv file exported as faith.tsv in view.qiime2.org
# to add customizations
faith <- read.table(file = "faith_significance.tsv",
                    sep = '\t',
                    header = TRUE)

# Plotting the Faith's PD boxplot
ggplot(faith, aes(y = faith_pd, x = feed)) +
  geom_boxplot() +
  stat_n_text() +
  theme_classic() +
  labs(x = "Feeding Method", y = "Faith's Phylogenetic Diversity Index")

# observed_features_group_significant.qzv file exported as 
# observed_feature_gs.tsv in view.qiime2.org
# to add customizations
observed_feature_gs <- read.table(file = "observed_feature_gs.tsv",
                    sep = '\t',
                    header = TRUE)

# Plotting the observed_feature_gs boxplot
ggplot(observed_feature_gs, aes(y = observed_features, x = feed)) +
  geom_boxplot() +
  stat_n_text() +
  theme_classic() +
  labs(x = "Feeding Method", y = "Observed Features")

# shannon_group_significant.qzv file exported as 
# shannon_gs.tsv in view.qiime2.org
# to add customizations
shannon_gs <- read.table(file = "shannon_gs.tsv",
                                  sep = '\t',
                                  header = TRUE)

# Plotting the shannon_gs boxplot
ggplot(shannon_gs, aes(y = shannon_entropy, x = feed)) +
  geom_boxplot() +
  stat_n_text() +
  theme_classic() +
  labs(x = "Feeding Method", y = "Shannon's Index")

# evenness_group_significance.qzv file exported as 
# evenness_gs.tsv in view.qiime2.org
# to add customizations
evenness_gs <- read.table(file = "evenness_gs.tsv",
                                  sep = '\t',
                                  header = TRUE)

# Plotting the evenness boxplot
ggplot(evenness_gs, aes(y = pielou_evenness, x = feed)) +
  geom_boxplot() +
  stat_n_text() +
  theme_classic() +
  labs(x = "Feeding Method", y = "Pielou's Evenness")