<Soil_QIIME2_script.txt>
#!/bin/bash

##############################

# user credentials for the server:
# User: root
# IP: 10.19.139.138
# Password: Biome625

############################## Create working directory in server and import data ############################## 

# move into the data folder of the server
cd data

# create directory for project and move into it
mkdir soil_project
cd soil_project

# import data into working directory (Make sure you are in the soil_project directory)
# this commands generates the demux.qza file
qiime tools import \  
    --type 'SampleData[SequencesWithQuality]' \  
    --input-path /mnt/datasets/project_2/soil/soil_manifest.txt \  
    --output-path demux.qza \  
    --input-format SingleEndFastqManifestPhred33V2
    
# create visualization for demux.qza file and visualize it
qiime demux summarize \  
    --i-data demux.qza \  
    --o-visualization demux.qzv
    
#copy metadata file from mnt folder to working directory
cp /mnt/datasets/project_2/soil/soil_metadata.txt soil_metadata.txt
    
# determine ASVs with DADA2
qiime dada2 denoise-single \
  --i-demultiplexed-seqs demux.qza \
  --p-trim-left 0 \
  --p-trunc-len 150 \
  --o-representative-sequences rep-seqs.qza \
  --o-table table.qza \
  --o-denoising-stats stats.qza
  
#visualize ASV statistics 
qiime feature-table summarize \
  --i-table table.qza \
  --o-visualization table.qzv \
  --m-sample-metadata-file soil_metadata.txt
  
  
############################## Do metadata filtering using qiime filter-table, filter-features and filter-samples command and generate table.qzv visualization ##############################   

#Filter out low frequency reads

qiime feature-table filter-features \
  --i-table table.qza \
  --p-min-frequency 10 \
  --o-filtered-table frequency_filter_table.qza

#Filter by metadata samples and select for samples only in BC
qiime feature-table filter-samples \
  --i-table frequency_filter_table.qza
  --m-metadata-file soil_metadata.txt \
  --p-where "[Region] IN ('British Columbia')" \
  --o-filtered-table freqeuncy_filter_BC_table.qza
  

# generate the visualization for the table.qza
qiime feature-table summarize \
  --i-table frequency_filter_BC_table.qza \
  --o-visualization filtered_table.qzv \
  --m-sample-metadata-file soil_metadata.txt

############################## Generate tree for phylogenetic analyses using the qiime-phylogeny command (rooted-tree.qza and unrooted-tree.qza) ##############################    

qiime phylogeny align-to-tree-mafft-fasttree \
  --i-sequences rep-seqs.qza \
  --o-alignment aligned-rep-seqs.qza \
  --o-masked-alignment masked-aligned-rep-seqs.qza \
  --o-tree unrooted-tree.qza \
  --o-rooted-tree rooted-tree.qza

############################## Generate alpha rarefaction visualization using the qiime diversity command. ##############################   

#Using the visualization, determine if the diversity for OM treatments is able to plateau and at which sequencing #depth. Select an adequate sequencing depth to be used in downstream alpha and beta diversity plots.

# alpha rarefaction only including BC
qiime diversity alpha-rarefaction \
  --i-table frequency_filter_BC_table.qza \
  --i-phylogeny rooted-tree.qza \
  --p-max-depth 9276 \
  --m-metadata-file soil_metadata.txt \
  --o-visualization alpha-rarefaction.qzv
  
# alpha-rarefaction including all regions
qiime diversity alpha-rarefaction \
  --i-table frequency_filter_table.qza \
  --i-phylogeny rooted-tree.qza \
  --p-max-depth 9276 \
  --m-metadata-file soil_metadata.txt \
  --o-visualization alpha-rarefaction_all_regions.qzv
  
#Ran the rarefaction again but with a different max-depth
qiime diversity alpha-rarefaction \
  --i-table frequency_filter_BC_table.qza \
  --i-phylogeny rooted-tree.qza \
  --p-max-depth 6000 \
  --m-metadata-file soil_metadata.txt \
  --o-visualization alpha-rarefaction.qzv
  
############################## Training a taxonomic classifier and classifying ASVs ##############################   

# dowload the Greenges 97% version Database
wget "ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz"

#Unzip the database
gunzip gg_13_8_otus.tar.gz
tar -xvf gg_13_8_otus.tar

#obtain 16S sequences
qiime tools import \
  --type 'FeatureData[Sequence]' \
  --input-path gg_13_8_otus/rep_set/97_otus.fasta \
  --output-path ref-otus.qza

#obtain the associated taxonomic information
qiime tools import \
  --type 'FeatureData[Taxonomy]' \
  --input-format HeaderlessTSVTaxonomyFormat \
  --input-path gg_13_8_otus/taxonomy/97_otu_taxonomy.txt \
  --output-path ref-taxonomy.qza

# extract the sequences between the primers from the 16S sequences in the database
qiime feature-classifier extract-reads \
  --i-sequences ref-otus.qza \
  --p-f-primer AGAGTTTGATCMTGGCTCAG \
  --p-r-primer GWATTACCGCGGCKGCTG \
  --p-trunc-len 150 \
  --o-reads ref-seqs.qza

# Train classifier based on the 16S reads
qiime feature-classifier fit-classifier-naive-bayes \
  --i-reference-reads ref-seqs.qza \
  --i-reference-taxonomy ref-taxonomy.qza \
  --o-classifier classifier.qza

# classify your ASVs
qiime feature-classifier classify-sklearn \
  --i-classifier classifier.qza \
  --i-reads rep-seqs.qza \
  --o-classification taxonomy.qza

# generate visualization of the taxonomy file
qiime metadata tabulate \
  --m-input-file taxonomy.qza \
  --o-visualization taxonomy.qzv

# generate taxonomy barplot
qiime taxa barplot \
  --i-table table.qza \
  --i-taxonomy taxonomy.qza \
  --m-metadata-file soil_metadata.txt \
  --o-visualization taxa-bar-plots.qzv

############################## Filter mitochondrial and chloroplast sequences using defined taxonomy classification ##############################   

# filter mitochondrial and chloroplast sequences
qiime taxa filter-table \
  --i-table frequency_filter_BC_table.qza \
  --i-taxonomy taxonomy.qza \
  --p-exclude mitochondria,chloroplast \
  --o-filtered-table frequency_filter_BC_no_mitoc_chloroplast_table.qza

############################## Generate the core metric directory (calculate alpha- and beta-diversity metrics) with filtered table and selected sequencing depth ##############################   

# command for core-metrics-results directory
qiime diversity core-metrics-phylogenetic \
  --i-phylogeny rooted-tree.qza \
  --i-table frequency_filter_BC_no_mitoc_chloroplast_table.qza \
  --p-sampling-depth 6041 \
  --m-metadata-file soil_metadata.txt \
  --output-dir core-metrics-results
  
  
# calculate beta-diversity significance for unweighted Unifrac based on LTSP treatment
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results/unweighted_unifrac_distance_matrix.qza \
  --m-metadata-file soil_metadata.txt \
  --m-metadata-column "LTSP Treatment" \
  --o-visualization core-metrics-results/unweighted-unifrac-LTSP-group-significance.qzv \
  --p-pairwise
  

# calculate beta-diversity significance for weighted Unifrac based on LTSP treatment
qiime diversity beta-group-significance \
  --i-distance-matrix core-metrics-results/weighted_unifrac_distance_matrix.qza \
  --m-metadata-file soil_metadata.txt \
  --m-metadata-column "LTSP Treatment" \
  --o-visualization core-metrics-results/weighted-unifrac-LTSP-group-significance.qzv \
  --p-pairwise
  
# calculate alpha diversity metrics for the data

# calculate Faiths alpha diversity metric
qiime diversity alpha-group-significance \
  --i-alpha-diversity core-metrics-results/faith_pd_vector.qza \
  --m-metadata-file soil_metadata.txt \
  --o-visualization core-metrics-results/faith-pd-group-significance.qzv

# calculate Pielou's eveness metric
qiime diversity alpha-group-significance \
  --i-alpha-diversity core-metrics-results/evenness_vector.qza \
  --m-metadata-file soil_metadata.txt \
  --o-visualization core-metrics-results/evenness-group-significance.qzv
  
# calculate total observed features (ASVs) metric
qiime diversity alpha-group-significance \
  --i-alpha-diversity core-metrics-results/observed_features_vector.qza \
  --m-metadata-file soil_metadata.txt \
  --o-visualization core-metrics-results/observed_features_significance.qzv

# calculate shannon diversity metric
qiime diversity alpha-group-significance \
  --i-alpha-diversity core-metrics-results/shannon_vector.qza \
  --m-metadata-file soil_metadata.txt \
  --o-visualization core-metrics-results/shannon_significance.qzv

############################## Export QIIME2 files and combine files into a .biom file ##############################


#make new directory for exported BIOM files
mkdir exported

#export BIOM files
qiime tools export \
  --input-path table.qza \
  --output-path exported 
qiime tools export \
  --input-path taxonomy.qza \
  --output-path exported 
qiime tools export 
  --input-path rooted-tree.qza \
  --output-path exported 

#open file in text editor on server
nano exported/taxonomy.tsv

#edit the column names and change
#'Feature ID' to '#OTUID'
#'Taxon' to 'taxonomy'
#'Confidence' to 'confidence'

#combine files into .biom file
biom add-metadata \
  -i exported/feature-table.biom \
  -o exported/table-with-taxonomy.biom \
  --observation-metadata-fp exported/taxonomy.tsv \
  --sc-separated taxonomy


<exploratory_analysis.Rmd>
---
output:
  pdf_document: default
  html_document: default
---
# Load all the packages needed:

```{r}
library(readr)
library(dplyr)
library(tidyverse)
library(readxl)
library(lubridate)
library(chron)
library(ggsci)
library(grid)
library(gridExtra)
```


# 1. Load the metadata from soil 
```{r}
soil_metadata <- read_excel("soil_metadata_reformatted.xlsx", na = "NA") %>%
  mutate(LTSP = as.factor(LTSP)) %>%
  mutate(Elevation = as.factor(Elevation)) 
soil_metadata

```

# 2. look at the different regions (were samples were taken from) in relationship to Carbon and Nitrogen
```{r}
soil_metadata %>%
  ggplot(aes(x = Total_Carbon, y = Total_Nitrogen, color = Region)) +
    geom_jitter() +
    labs(x ="Total Carbon", y = "Total Nitrogen")
```


# filter to only include samples in British Columbia and looks at Total Carbon and Total nitrogen in relationship to LTSP treatment (REF, OM1, OM2, OM3)
```{r}
soil_metadata %>%
  filter(Region == "British Columbia") %>%
  ggplot(aes(x = Total_Carbon, y = Total_Nitrogen, color = LTSP)) +
    geom_jitter(width = 0.5, height = 0.1) +
    labs(x ="Total Carbon", y = "Total Nitrogen")
```


# filter data to include only samples from one OM1,2 or Ref and compare it to OM3 (this is so that we can easily visualize if there are any visible distinctions)

Comparison between Ref and OM3
```{r}
soil_metadata %>%
  filter(Region == "British Columbia") %>%
  filter(LTSP %in% c("REF", "OM3")) %>%
  ggplot(aes(x = Total_Carbon, y = Total_Nitrogen, color = LTSP)) +
    geom_jitter(width = 0.5, height = 0.1) +
    labs(x ="Total Carbon", y = "Total Nitrogen")
```

Comparison between OM1 and OM3
```{r}
soil_metadata %>%
  filter(Region == "British Columbia") %>%
  filter(LTSP %in% c("OM1", "OM3")) %>%
  ggplot(aes(x = Total_Carbon, y = Total_Nitrogen, color = LTSP)) +
    geom_jitter(width = 0.5, height = 0.1) +
    labs(x ="Total Carbon", y = "Total Nitrogen")
```

Comparison between OM1 and OM3
```{r}
soil_metadata %>%
  filter(Region == "British Columbia") %>%
  filter(LTSP %in% c("OM2", "OM3")) %>%
  ggplot(aes(x = Total_Carbon, y = Total_Nitrogen, color = LTSP)) +
    geom_jitter(width = 0.5, height = 0.1) +
    labs(x ="Total Carbon", y = "Total Nitrogen")
```

```{r}
soil_metadata %>%
  filter(Region == "British Columbia") %>%
  ggplot(aes(x = Moisture_content, y = Soil_Bulk_Density, color = LTSP)) +
    geom_jitter() +
    labs(x ="moisture content", y = "Soil bulk density")
```


# look at total carbon and total nitrogen in relationship to soil horizon

Comparison between OM1 and OM3
```{r}
soil_metadata %>%
  filter(Region == "British Columbia") %>%
  ggplot(aes(x = Total_Carbon, y = Total_Nitrogen, color = Horizon)) +
    geom_jitter(width = 0.5, height = 0.1) +
    facet_wrap(~ LTSP) +
    labs(x ="Total Carbon", y = "Total Nitrogen")

```


<PCoA_plots.Rmd>
# Load CRAN packages
```{r}
library(tidyverse)
library(vegan)
```

# Load Bioconductor packages
```{r}
library(phyloseq)
library(DESeq2)
library(tidyr)
library(ape)
```

**Note: Before doing the following analyses, make sure that you have exported the table.qza, taxonomy.qza, metadata and rooted-tree files into you working directory**
- for instructions on how to do this, refer to the QIIME2 working script

# Export biom file and tree from QIIME2 and provide original metadata file
```{r}
biom_file <- import_biom("exported/table-with-taxonomy.biom")
metadata  <- import_qiime_sample_data("soil_metadata.txt")
tree      <- read_tree_greengenes("exported/tree.nwk")
tree <- multi2di(tree) # assuming you read in your tree.nwk file as the variable tree
```

# Combine all information into a single phyloseq object
```{r}
physeq <-  merge_phyloseq(biom_file, metadata, tree)
```

```{r}
# look at sample metadata
head(sample_data(physeq))

# downstream steps may use randomizers so we set.seet for reproducibility
set.seed(711)

# taxonomic rank names
rank_names(physeq) #kingdom class and all of that, however you can change the name

# rename the ranks
colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
head(tax_table(physeq))
```

# filter data based on metadata category (British Columbia)
```{r}
# filter data on metadata
filtered_phyloseq <- subset_samples(physeq, Region == "British Columbia")
filtered_phyloseq

#format data as table with as.data.frame()
as.data.frame(sample_names(filtered_phyloseq))

#show read count for each sampl
```

# beta diversity plots
```{r}
# diversity analysis requires rarefied taxa tables
# we know from Qiime that we should rarefy to 6041, so right palm samples level out
physeq_rar <- rarefy_even_depth(filtered_phyloseq, sample.size = 6041)

# convert to RA (relative abundance)
physeq_rar_RA <- transform_sample_counts(physeq_rar, function(x) x/sum(x)) 

# use our data to do beta-diversity analysis
# we define the type of analysis using the ordinate() function and setting the method = PCoA, setting distance to type 
# of beta diversity analysis
# you can change analysis to jaccard, bray-curtis and so on
ord <- ordinate(physeq_rar_RA, method = "PCoA", distance = "wunifrac")

# plot the data based on OM removal treatments and soil horizon
plot_ordination(physeq_rar_RA, ord, 
                type = "sample", 
                color = "LTSP.Treatment", 
                title = "PCoA (weighted UniFrac)",
                shape = "Horizon") +

# manually adjust colors for points
scale_colour_manual(values = c("blue", "red", "black", "orange")) +
                    #labels = c("REF", "Left palm", "Right palm", "Tongue")) +  # change the labels
  
# add elipse automatically
  stat_ellipse(type = "norm", size = 1) +
  
# how to change the legend title
  guides(color = guide_legend("LTSP Treatment")) +
  
# add theme
  theme_bw(base_size = 14)
```

# plot the data based on OM treatment == "REF"
```{r}
# plot the data based on OM treatment == "REF"
plot_ordination(subset_samples(physeq_rar_RA, LTSP.Treatment == "REF"), ord, 
                type = "sample", 
                color = "LTSP.Treatment", 
                title = "PCoA (weighted UniFrac)",
                shape = "Horizon") +

# manually adjust colors for points
scale_colour_manual(values = c("orange")) +
  
# add elipse automatically
  stat_ellipse(type = "norm", size = 1) +
  
# how to change the legend title
  guides(color = guide_legend("LTSP Treatment")) +
  
# add theme
  theme_bw(base_size = 14)
```

# plot the data based on OM treatment == "OM1"
```{r}
# plot the data based on OM treatment == "REF"
plot_ordination(subset_samples(physeq_rar_RA, LTSP.Treatment == "OM1"), ord, 
                type = "sample", 
                color = "LTSP.Treatment", 
                title = "PCoA (weighted UniFrac)",
                shape = "Horizon") +

# manually adjust colors for points
scale_colour_manual(values = c("blue")) +
  
# add elipse automatically
  stat_ellipse(type = "norm", size = 1) +
  
# how to change the legend title
  guides(color = guide_legend("LTSP Treatment")) +
  
# add theme
  theme_bw(base_size = 14)
```

# plot the data based on OM treatment == "OM2"
```{r}
# plot the data based on OM treatment == "REF"
plot_ordination(subset_samples(physeq_rar_RA, LTSP.Treatment == "OM2"), ord, 
                type = "sample", 
                color = "LTSP.Treatment", 
                title = "PCoA (weighted UniFrac)",
                shape = "Horizon") +

# manually adjust colors for points
scale_colour_manual(values = c("red")) +
  
# add elipse automatically
  stat_ellipse(type = "norm", size = 1) +
  
# how to change the legend title
  guides(color = guide_legend("LTSP Treatment")) +
  
# add theme
  theme_bw(base_size = 14)
```

# plot the data based on OM treatment == "OM3"
```{r}
# plot the data based on OM treatment == "REF"
plot_ordination(subset_samples(physeq_rar_RA, LTSP.Treatment == "OM3"), ord, 
                type = "sample", 
                color = "LTSP.Treatment", 
                title = "PCoA (weighted UniFrac)",
                shape = "Horizon") +

# manually adjust colors for points
scale_colour_manual(values = c("black")) +
  
# add elipse automatically
  stat_ellipse(type = "norm", size = 1) +
  
# how to change the legend title
  guides(color = guide_legend("LTSP Treatment")) +
  
# add theme
  theme_bw(base_size = 14)
```


# plot the data based on other variables (Mean annual precipitation)
```{r}
# plot the data
physeq_rar_RA %>%

plot_ordination(ord, 
                type = "sample", 
                color = "Mean.Annual.Precipitation..mm.", 
                title = "PCoA (weighted UniFrac)") +
  
# how to change the legend title
  guides(color = guide_legend("Mean annual precipitation")) +
  
# add theme
  theme_bw(base_size = 14)
```

# plot the data based on other variables (Ecozone)
```{r}
# plot the data
physeq_rar_RA %>%

plot_ordination(ord, 
                type = "sample", 
                color = "Ecozone", 
                title = "PCoA (weighted UniFrac)") +
  
# how to change the legend title
  guides(color = guide_legend("Ecozone")) +
  
# add theme
  theme_bw(base_size = 14)
```

# plot the data based on other variables (Ecozone)
```{r}
# plot the data
physeq_rar_RA %>%

plot_ordination(ord, 
                type = "sample", 
                color = "Site", 
                title = "PCoA (weighted UniFrac)") +
  
# how to change the legend title
  guides(color = guide_legend("Ecozone")) +
  
# add theme
  theme_bw(base_size = 14)
```

#PCoA plot of all the regions (BC, Texas, Ontario, California; with no filters)

```{r}
# diversity analysis requires rarefied taxa tables
# we know from Qiime that we should rarefy to 6041, so right palm samples level out
physeq_rar <- rarefy_even_depth(physeq, sample.size = 6000)

# convert to RA (relative abundance)
physeq_rar_RA <- transform_sample_counts(physeq_rar, function(x) x/sum(x)) 

# use our data to do beta-diversity analysis
# we define the type of analysis using the ordinate() function and setting the method = PCoA, setting distance to type 
# of beta diversity analysis
# you can change analysis to jaccard, bray-curtis and so on
ord <- ordinate(physeq_rar_RA, method = "PCoA", distance = "wunifrac")

# plot the data
plot_ordination(physeq_rar_RA, ord, 
                type = "sample", 
                color = "Region", 
                title = "PCoA (weighted UniFrac)") +

# manually adjust colors for points
scale_colour_manual(values = c("blue", "red", "black", "orange")) +
  
# add elipse automatically
  stat_ellipse(type = "norm", size = 1) +
  
# how to change the legend title
  guides(color = guide_legend("Region")) #+
  
# add theme
  #theme_bw(base_size = 14)

```


<logistic_regression_analysis.Rmd>
In this script we will look at doing logistical regression analysis of alpha-diversity metrics
to control for confounding variables that may be affecting the diversity.

1. We load the data from alpha diversity metrics donwloaded from the QIIME2 view online sofware. We do this by inputing the resultinf files from the 'alpha-diversity significance' commands in QIIME2 into the QIIME2 view software. (Refer to the QIIME2 working script for more details on this)
  - Shannon diversity
  - observed OTUs
  - Pielous eveness
  - Faith metric

we do this by doing putting the alpha diversity .qzv files that we previously calculated (see pg X of lab book)into qiime view online software and then downloading the results in TSVs formats and loading them into R

to start, we first load the necessary libraries
```{r}
library(dplyr)
library(tidyr)
library(ggplot2)
library(tidyverse)
```

**now we load the data into R and merge the different alpha diversity metrics and the metadata into one table object**
NOTE: the .tsv files downlaoded from the qiime table already contain some categorical columns, but they omit numerical ones. In our regression analysis we are going to use both of them. Therefore we need to load the metadata and add some of this numerical coumns into our final object

```{r}
# load each of the tables independently
shannon <- read_tsv("shannon.tsv") # we take out the first row which have the categorizations based on qiime
observed <- read_tsv("observed_features.tsv")
faiths <- read_tsv("faiths.tsv")
evenness <- read_tsv("evenness.tsv")

# load the metadata
metadata <- read_csv("soil_metadata.csv") %>%
  rename(id = `#SampleID`) # rename the #SampleID column to a column name that we can math to other data frames

#merge the tables into a table that contains the the alpha-diversity metrics and their sample id
alpha_metrics <- bind_cols(shannon[,c(1, 18)],
                           observed[,18],
                           faiths[,18],
                           evenness[,18]) %>%
                as.data.frame() # this converts the table into a data.frame

# join the alpha_metrics with the metadata based on their sample id
alpha_metrics <- right_join(metadata, alpha_metrics, by = "id") 

```


2. now using the weighted unifract beta diversity plot, we highligh the variables that have clustering patterns and we also look 
at which of them are highly correlated (or have the same clustering patterns), such that we only account for one of them in our data


**we look at the structure of the data and see if it has been categorized corrrectly**
```{r}
str(alpha_metrics)
```

**We need to do two things to prepare this data for analysis**
  1. we need to filter the columns that will be useful in our analysis. The filtering parameters are as follows:
      - They need to have more than one group in the variable
      - There cannot be two variables that are highly correlated with each other (we check this using the beta diversity plots and
      the alias() function in R)
      - we cannot use data with missing values in our model since these values will be dropped

  2. change the columns that are read as "chr" types to "factor"
  
```{r}
# we are now selecting for only the metrics that we will be using in our analysis using the select() function
alpha_metrics <- alpha_metrics %>%
  select(Ecozone,             #where samples were collected (same as climatic zone)
         `Collection Date`,   #the date they were collected
         Site,                #the place in BC they were collected (same as with Tree cover, longitude, latitude, and elevation)
         Horizon,             # the soil horizon (same as sampling depth)
         `LTSP Treatment`,    # the LTSP treatment
         `Mean Annual Temperature (Celsius)`, # The mean annual temperature of the colleciton site
         `Soil Classification`, # the type of soil
         `Compaction Treatment`, # how the soild was compacted
         pH,                  # note that there are missing values!
         `Total Carbon`,      # note that there are missing values! 
         `Total Nitrogen`,    # note that there are missing values!
         `Soil Bulk Density`, # note that there are missing values!
         `CN Ratio`,          # note that there are missing values!
         `Moisture Content`,   # No missing values!!!
         shannon_entropy,
         observed_features,
         pielou_evenness,
         faith_pd)          

# now we change the data type in each column to represent "chr" columns as "factor" type
alpha_metrics <- alpha_metrics %>%
  mutate(Ecozone = as.factor(Ecozone), # 2 ezocones
         `Collection Date` = as.factor(`Collection Date`),
         Horizon = as.factor(Horizon),
         `LTSP Treatment` = as.factor(`LTSP Treatment`),
         Site = as.factor(Site),       # 6 collection "Site"
         `Mean Annual Temperature (Celsius)` = as.factor(`Mean Annual Temperature (Celsius)`),
         `Compaction Treatment` = as.factor(`Compaction Treatment`),
         `Soil Classification` = as.factor(`Soil Classification`)
        )

#we will also rename some variable to make them easier to manage
alpha_metrics <- alpha_metrics %>%
  rename(Date = `Collection Date`,
         Treatment = `LTSP Treatment`,
         Annual_temperature = `Mean Annual Temperature (Celsius)`,
         Soil_classification = `Soil Classification`,
         Compaction = `Compaction Treatment`) 

# we check to see that we corrected the types and renamed the columns correctly
str(alpha_metrics)
```


3. now we can do our logistical regression analysis of the alpha-diversity metrics

- shannon diversity with only categorical variables! (we will use the glm() and summary() R based functions to do this)
```{r}
# calculate the shannon regression and the summary

shannon_regression <- glm(shannon_entropy ~ Ecozone  + Horizon  + Treatment + Site + Compaction + `Moisture Content`, data = alpha_metrics)
summary(shannon_regression)
```


Calculate the R^2 value of the shannon model
```{r}
# calculate the R^2 value of the model
ll.null <- shannon_regression$null.deviance/-2
ll.proposed <- shannon_regression$deviance/-2
(ll.null - ll.proposed)/ll.null
```

Plot the data for shannon diversity
```{r}
# create a new data frame object containing the predicted alpha diversity values and metadata categories
predicted.data.shannon <- data.frame(
  probability.of.shannon = predict(shannon_regression, type = "response"),
  shannon_entropy = alpha_metrics$shannon_entropy,
  Ecozone = alpha_metrics$Ecozone,
  Horizon = alpha_metrics$Horizon, 
  Treatment = alpha_metrics$Treatment,
  Site = alpha_metrics$Site,
  Compaction = alpha_metrics$Compaction,
  Temperature = alpha_metrics$Annual_temperature,
  Date = alpha_metrics$Date,
  pH = alpha_metrics$pH,
  Carbon = alpha_metrics$`Total Carbon`,
  Nitrogen = alpha_metrics$`Total Nitrogen`,
  Moisture = alpha_metrics$`Moisture Content`
)

# rearrange the LTSP treatments to order the plot
predicted.data.shannon$Treatment <- factor(predicted.data.shannon$Treatment, levels = c("REF","OM1", "OM2", "OM3"))

# plot Treatment vs the predicted values
ggplot(predicted.data.shannon, aes(x= Treatment, y= probability.of.shannon)) +
  geom_boxplot(color = "black",fill='#A4A4A4') +
  geom_jitter(position = position_jitter(0.2)) +
  scale_y_continuous(limits = c(5.4, 7.0)) +
  labs(x = "LTSP treatment", y = "Corrected Shannon Value") +
  theme_classic()

```

one way anova test
```{r}
anova_shannon <- aov(probability.of.shannon ~ Treatment, data = predicted.data.shannon)
summary(anova_shannon)
```


- model for faith alpha diversity metric
```{r}
regression_faiths <- glm(faith_pd ~ (Ecozone  + Horizon + Treatment + Site + Compaction + `Moisture Content`), data = alpha_metrics)
summary(regression_faiths)

```

Calculate the R^2 value
```{r}
ll.null <- regression_faiths$null.deviance/-2
ll.proposed <- regression_faiths$deviance/-2
(ll.null - ll.proposed)/ll.null
```


plot the data
```{r}
# create a new data frame object containing the predicted alpha diversity values and metadata categories
predicted.data.faiths <- data.frame(
  probability.of.faith = predict(regression_faiths, type = "response"),
  Faith_pd = alpha_metrics$faith_pd,
  Ecozone = alpha_metrics$Ecozone,
  Horizon = alpha_metrics$Horizon, 
  Treatment = alpha_metrics$Treatment,
  Site = alpha_metrics$Site,
  Compaction = alpha_metrics$Compaction
)

# rearrange the LTSP treatments to order the plot
predicted.data.faiths$Treatment <- factor(predicted.data.faiths$Treatment, levels = c("REF","OM1", "OM2", "OM3"))

# plot Treatment vs the predicted values
ggplot(predicted.data.faiths, aes(x= Treatment, y= probability.of.faith)) +
  geom_boxplot(color = "black",fill='#A4A4A4') +
  geom_jitter(position = position_jitter(0.2)) +
  scale_y_continuous(limits = c(14, 44)) +
  labs(x = "LTSP treatment", y = "Controlled Faiths Value") +
  theme_classic() 

```

make an ANOVA comparison 
```{r}
anova_faiths <- aov(probability.of.faith ~ Treatment, data = predicted.data.faiths)
summary(anova_faiths)
```


- regression model for pielou's evenness metric
```{r}
regression_pielou <- glm(pielou_evenness ~ (Ecozone  + Horizon + Treatment + Site + Compaction + `Moisture Content`), data = alpha_metrics)
summary(regression_pielou)

```

Calculate R^2 value for model
```{r}
ll.null <- regression_pielou$null.deviance/-2
ll.proposed <- regression_pielou$deviance/-2
(ll.null - ll.proposed)/ll.null
```

```{r}
predicted.data.pielou <- data.frame(
  probability.of.evenness = predict(regression_pielou, type = "response"),
  evenness = alpha_metrics$pielou_evenness,
  Ecozone = alpha_metrics$Ecozone,
  Horizon = alpha_metrics$Horizon, 
  Treatment = alpha_metrics$Treatment,
  Site = alpha_metrics$Site,
  Compaction = alpha_metrics$Compaction
)

# rearrange the LTSP treatments to order the plot
predicted.data.pielou$Treatment <- factor(predicted.data.pielou$Treatment, levels = c("REF","OM1", "OM2", "OM3"))

# plot Treatment vs the predicted values
ggplot(predicted.data.pielou, aes(x= Treatment, y= probability.of.evenness)) +
  geom_boxplot(color = "black",fill='#A4A4A4') +
  geom_jitter(position = position_jitter(0.2)) +
  scale_y_continuous(limits = c(0.86, 0.96)) +
  labs(x = "LTSP treatment", y = "Controlled Evenness Value") +
  theme_classic() 

```

one way anova analysis of predicted values
```{r}
anova_pielou <- aov(probability.of.evenness ~ Treatment, data = predicted.data.pielou)
summary(anova_pielou)
```


- regression model for observed_features
```{r}
# calculate the regression model using glm() and then use summary() to see the coefficients
regression_observed <- glm(observed_features ~ (Ecozone  + Horizon + Treatment + Site + Compaction + `Moisture Content`), data = alpha_metrics)
summary(regression_observed)

```

Calculate the R^2 value of the model
```{r}
ll.null <- regression_observed$null.deviance/-2
ll.proposed <- regression_observed$deviance/-2
(ll.null - ll.proposed)/ll.null
```

Plot the data
```{r}
predicted.data.observed <- data.frame(
  probability.of.features = predict(regression_observed, type = "response"),
  features = alpha_metrics$observed_features,
  Ecozone = alpha_metrics$Ecozone,
  Horizon = alpha_metrics$Horizon, 
  Treatment = alpha_metrics$Treatment,
  Site = alpha_metrics$Site,
  Compaction = alpha_metrics$Compaction
)


# rearrange the LTSP treatments to order the plot
predicted.data.observed$Treatment <- factor(predicted.data.observed$Treatment, levels = c("REF","OM1", "OM2", "OM3"))

# plot Treatment vs the predicted values
ggplot(predicted.data.observed, aes(x= Treatment, y= probability.of.features)) +
  geom_boxplot(color = "black",fill='#A4A4A4') +
  geom_jitter(position = position_jitter(0.2)) +
  scale_y_continuous(limits = c(60, 220)) +
  labs(x = "LTSP treatment", y = "Controlled Oberseved Features Value") +
  theme_classic() 

```

one way anova analysis of predicted observed features
```{r}
anova_observed <- aov(probability.of.features ~ Treatment, data = predicted.data.observed)
summary(anova_observed)
```


<indicator_taxa.Rmd>
The following script is for an indicator taxa analysis to look at taxa in specific OM treatments that are not found in others

**Load the packages needed**
```{r}
library(dplyr)
library(phyloseq)
library(indicspecies)
library(ape)
library(ggplot2)
library(forcats)
library(tidyverse)
library(broom)
library(gridExtra)
```


**Function to group ASVs table by higher order taxonomy (By Ilan Rubin)**
```{r}
group_by_taxonomy = function(asv_table, taxonomy, rank){
  asv_table = as.data.frame(asv_table)
  taxonomy = as.data.frame(taxonomy)
  taxonomy$ASV = rownames(taxonomy)
  asv_table$ASV = rownames(asv_table)
  asv_table = inner_join(taxonomy,asv_table,by="ASV")
  asv_table$taxa = apply(asv_table[,1:rank],1,paste,collapse=" ")
  asv_table = asv_table[,-(1:8)]
  asv_table = group_by(data.frame(asv_table),taxa)
  taxa_table = as.data.frame(summarise_all(asv_table,sum))
  rownames(taxa_table) = taxa_table$taxa
  return(taxa_table[,-1])
}
```

**Load the data**
This step assumes you have run the 'qiime export' commands previously and have the downloaded files. Refer to the QIIME2 working script if you need to see this commands.
```{r}
biom = import_biom("exported/table-with-taxonomy.biom")
metadata  <- import_qiime_sample_data("soil_metadata.txt")
tree      <- read_tree_greengenes("exported/tree.nwk")
tree <- multi2di(tree) # assuming you read in your tree.nwk file as the variable tree
```


**Create phyloseq object and filter it based on metadata**

```{r}
physeq <-  merge_phyloseq(biom, metadata, tree)

# filter data on metadata
filtered_phyloseq <- subset_samples(physeq, Region == "British Columbia")
```

**subset the data into tables needed for our analysis**
```{r}
taxa_table <- otu_table(filtered_phyloseq)
taxonomy <- tax_table(filtered_phyloseq)
metadata <- sample_data(filtered_phyloseq)
```


**Create the taxa table for the rank of taxonomy required**
Kingdom=1, Phylum=2, Class=3, Order=4, Family=5, Genus=6, Species=7
```{r}
taxa_table = group_by_taxonomy(taxa_table, taxonomy, 3)
```

### Calculate indicator values
```{r}
indicator_multipatt = multipatt(t(taxa_table),metadata$LTSP.Treatment,duleg=T)
summary(indicator_multipatt)


indicator_output = capture.output(summary(indicator_multipatt,indvalcomp=TRUE))
write.table(indicator_output,file="indiciator_values.txt",row.names=F,quote=F)
```


# taxonomic analysis of selected taxonomic classes

**Import the taxonomic species data from taxonomy-bar plot**
```{r}
species_data <- read_csv("level-7.csv")
head(species_data)
```

**import function that determines relative abundance analysis at a certain taxonomic grouping**
```{r}
source("RelativeTaxa.R")
```

**make the relative abundance analysis of the classified ASVs for each sample**
Data is grouped and the family taxonomic level (taxa_level = 3)
```{r}
dat <- RelativeData(species_data, merge = TRUE, taxa_level = 3, Metadata = c(451:length(species_data)))
  # merge means that it will group ASVs at a higher taxonomy order
  # taxa_level refers to the taxonomic leve. In this case it is "Class"
  # metadata is just the columns in the metadata should not be taking into consideration in the analysis
```

**find the metadata columns**
```{r}
metadata_names <- colnames(species_data)[451: length(species_data)]
metadata_names

```

**Write a vector with the classes from the indicator taxa analysis that were significant and that have an adequate classification**
```{r}
significant_results <- c("c__Alphaproteobacteria",
                         "c__Ellin6529",
                         "c__Gemmatimonadetes",
                         "c__Bacilli",
                         "c__P2-11E",
                         "c__Chloroflexi",
                         "c__Betaproteobacteria",
                         "c__iii1-8",
                         "c__Nitrospira",
                         "c__TK10",
                         "c__[Chloracidobacteria]",
                         "c__Gitt-GS-136",
                         "c__Ktedonobacteria",
                         "c__Solibacteres",
                         "c__Gammaproteobacteria",
                         "c__Acidobacteriia",
                         "c__Actinobacteria",
                         "c__Sphingobacteriia",
                         "c__Cytophagia",
                         "c__Opitutae",
                         "c__Chthonomonadetes",
                         "c__Armatimonadia",
                         "c__Verrucomicrobiae")

```


**now we need to run our significant results through a logistic regression model to account for other confounding variables and determine in which models OM treatments are significant in explaining variation in abundance **

this step just renames some categorical variables to make it easier
```{r}
# rename some of the categorical variables to make them easier to use
dat <- dat %>%
  rename(Date = `Collection Date`,
         Treatment = `LTSP Treatment`,
         Annual_temperature = `Mean Annual Temperature (Celsius)`,
         Soil_classification = `Soil Classification`,
         Compaction = `Compaction Treatment`) 
```

**In this step we create a "for" loop that stablishes which of the taxonomic classes from our indicator taxa analysis is significanly explained by OM removal treatment when controlling for other confounding variables** 
```{r}

# create data frame with corrected relative abundance values
dat_corrected <- dat %>%
  select(index, Date, Treatment, Ecozone, Horizon, Treatment, Site, Compaction, `Moisture Content`)

# Make a for loop that checks which of the taxonomic classes from the indicator taxa analysis 
# is significantly explained by OM removal treatment and stored the predicted values from the 
# logistic regression model in a dataframe
filtered_significant_results <- vector(mode = "character")
for (i in significant_results){
  x <- dat[i] %>% unlist
  corrected_values <- predict(glm(x ~ Ecozone  + Horizon  + Treatment + Site + Compaction + `Moisture Content`, 
                data = dat), type = "response")
  LR <- tidy(glm(x ~ Ecozone  + Horizon  + Treatment + Site + Compaction + `Moisture Content`, 
                data = dat))
  if (LR[4, 5] <= 0.05 | LR[5,5] <= 0.05 | LR[6,5] <= 0.05){
  filtered_significant_results <- append(filtered_significant_results, i)  
  
  dat_corrected[i] <- corrected_values # adds the corrected values to our data frame
  }
}
dat_corrected
```


Create boxplots of a subset of significant classes
```{r}
# assign order in which boxplots will be plotted
dat_corrected$Treatment <- factor(dat$Treatment, levels = c("REF", "OM1", "OM2", "OM3"))


a1 <- ggplot(dat_corrected, aes(x= Treatment, y = c__Alphaproteobacteria, fill = Treatment)) + 
              geom_boxplot() +
              labs(x = "OM removal treatment", y = "Predicted Relative Abundance", title = str_sub("c__Alphaproteobacteria",4 )) +
              theme_classic() +
              scale_fill_npg() +
              scale_y_continuous(labels = scales::percent)

a2 <- ggplot(dat_corrected, aes(x= Treatment, y = c__Acidobacteriia, fill = Treatment)) + 
              geom_boxplot() +
              labs(x = "OM removal treatment", y = "Predicted Relative Abundance", title = str_sub("c__Acidobacteriia",4 )) +
              theme_classic() +
              scale_fill_npg() +
              scale_y_continuous(labels = scales::percent)

a3 <- ggplot(dat_corrected, aes(x= Treatment, y = c__Bacilli, fill = Treatment)) + 
              geom_boxplot() +
              labs(x = "OM removal treatment", y = "Predicted Relative Abundance", title = str_sub("c__Bacilli",4 )) +
              theme_classic() +
              scale_fill_npg() +
              scale_y_continuous(labels = scales::percent)

a4 <- ggplot(dat_corrected, aes(x= Treatment, y = c__Gammaproteobacteria, fill = Treatment)) + 
              geom_boxplot() +
              labs(x = "OM removal treatment", y = "Predicted Relative Abundance", title = str_sub("c__Gammaproteobacteria",4 )) +
              theme_classic() +
              scale_fill_npg() +
              scale_y_continuous(labels = scales::percent)

a5 <- ggplot(dat_corrected, aes(x= Treatment, y = c__Actinobacteria, fill = Treatment)) + 
              geom_boxplot() +
              labs(x = "OM removal treatment", y = "Predicted Relative Abundance", title = str_sub("c__Actinobacteria",4 )) +
              theme_classic() +
              scale_fill_npg() +
              scale_y_continuous(labels = scales::percent)

a6 <- ggplot(dat_corrected, aes(x= Treatment, y = c__Ellin6529, fill = Treatment)) + 
              geom_boxplot() +
              labs(x = "OM removal treatment", y = "Predicted Relative Abundance", title = str_sub("c__Ellin6529",4 )) +
              theme_classic() +
              scale_fill_npg() +
              scale_y_continuous(labels = scales::percent)

```


```{r}
# arrange the plos of the relevant taxas
grid.arrange(a1, a2, a3, a4, a5, a6, ncol = 2)
```

# Plot the remaining significant classes
```{r}
x <- gather(dat_corrected, key = "Taxa", value = "Count", filtered_significant_results) %>%
  mutate(Taxa = as.factor(Taxa)) %>%
  ggplot(aes(x= Treatment, y = Count, fill = Treatment)) + 
              geom_boxplot() +
              labs(x = "OM removal treatment", y = "Predicted Relative Abundance") +
              theme_classic() +
              scale_fill_npg() +
              scale_y_continuous(labels = scales::percent) +
              facet_wrap(. ~ Taxa, nrow = 5, ncol = 3, scales = "free")
```


<taxonomic_bar_plot.Rmd>
# Species level classification of 16S experiment

## Start with loading all necessary packages:

```{r}
library(dplyr)
library(tidyr)
library(ggplot2)
library(forcats)
library(tidyverse)
library(RColorBrewer)
library(ggsci)
```

## Import the species data and take a look at it

```{r}
species_data <- read_csv("level-7.csv")
head(species_data)
```


# find the metadata columns
```{r}
metadata_names <- colnames(species_data)[451: length(species_data)]
metadata_names

metadata <- species_data[,451:length(species_data)]

```


# import function to do the relative abundance analysis
```{r}
source("RelativeTaxa.R")
```


# make the relative abundance analysis of the classified ASVs for each sample

Data is grouped and the family taxonomic level (taxa_level = 3)
```{r}
dat <- RelativeData(species_data, merge = TRUE, taxa_level = 3, Metadata = c(451:length(species_data)))
```


# plot the stacked bar plots

Gather the data in a readable format for ggplot2
```{r}
# gether the data in a format that is readable for ggplot2
dat_gather <- dat %>% gather(key = "Class", 
                             value = "Count", 
                              -c(1, metadata_names)) %>% data.frame

```

Ploting formatting variables used to plot taxonomic bar plot 
```{r}
theme1 <- theme(legend.title = element_text(size = 10),
        legend.text = element_text(size = 5), 
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.box = "vertical")

guide1 <- guides(shape = guide_legend(override.aes = list(size = 1)), 
         color = guide_legend(override.aes = list(size = 0.5)),
         fill = guide_legend(ncol = 2))

geom_stacked <- stat_summary(geom = "bar", fun.data = mean_se, position = "fill")

```


```{r}
# Plot the composition in relationship to OM treatment and faceted based on soil horizon

dat_gather$LTSP.Treatment <- factor(dat_gather$LTSP.Treatment, levels = c("REF", "OM1", "OM2", "OM3"))

OM_plot <-
  dat_gather %>%
  filter(Count >= 0.01) %>%
  #filter(Species != "Unassigned") %>%
  ggplot(aes(x = LTSP.Treatment, y = Count, fill = Class)) +
  
  geom_stacked +
  
  facet_grid( ~ Horizon, scales = "free_x" ) +
  
  # Change legend size and the x axis labels to be vertical:
  theme1 +
  guide1 + 
  labs(x = "OM removal Treatment", y= "Relative abundance")
  
OM_plot
```


<RelativeTaxa.R>
## function for cleaning up taxa_barplot .csv file from qiime

# Load important packages 
library(tidyverse)
library(dplyr)
library(forcats)
library(tidyr)


choose_indexes <- function(dat, type = "numeric"){
  indexes <- c()
  n_row <- nrow(dat) # number of rows
  n_col <- ncol(dat) # number of columns
  
  for (i in 1:n_col){
    if (class(unlist(dat[i])) == type){
      indexes <- append(indexes, i)  
    }
  }
  return(indexes)
}

extract_taxa <- function(name, level = 7, nearest_taxa = FALSE) {
  split_data <- strsplit(name, ";") %>% unlist # splits the name based into a vector
  
  if (length(split_data) == 1){
    return(name)}
  element <- split_data[level] # extract last element of the vector which should contain the species classification
  
  if (nearest_taxa == TRUE){
    while (level > 0){
      if (nchar(element) == 3 & grepl(".__", element) == TRUE | element == "__"){
        level <- level - 1
        element <- split_data[level]
        next} 
      else {
        return(element)}
    }
  }
  else if (nearest_taxa == FALSE){
    if (nchar(element) == 3 & grepl(".__", element) == TRUE |element == "__"){
      return("Unassigned")} 
    else {
      return(element)}
  }
  
  return("Unassigned") # if nothing was able to classify!
}

RelativeData <- function(dat, merge = TRUE, taxa_level = 7, nearest_taxa = FALSE, Metadata = c(), TaxaCutoff = 0.005, SampleFilter = FALSE, SampleCutoff = 0,Pseudocount = FALSE){ ####### addd a remove paramenter to take care of metadata
  
  
  ##################### FACTORS #########################################################################
  # Change any character column to factors --> easier to work with and more functionality
  character_indexes <- choose_indexes(dat, type = "character")
  dat[, character_indexes] <- lapply(dat[, character_indexes], as.factor) # change all columns with character value to factors
  
  ##################### Change the column names to the chosen taxa #######################################
  new_column_names <- sapply(colnames(dat), extract_taxa, taxa_level, nearest_taxa)
  names(dat) <- new_column_names
  
  #################### separate the metadata from the data for further analysis #########################
  if (length(Metadata) > 0){
    metadata <- dat[, Metadata]
    if (class(Metadata) == "numeric" | class(Metadata) == "integer"){
      dat <- dat[, -Metadata]
      }
    else if (class(Metadata) == "character" | class(Metadata) == "factor") {
      dat <- dat[, !names(dat) %in% Metadata]
      }
  }
  
  ##################### Merge the columns with same name #################################################
  # Merge columns with the samen name together if merge = TRUE
  #collapse columns that have the same name (sum them together)
  if (merge == TRUE){
    col_n <- ncol(dat)
    character_indexes <- choose_indexes(dat, type = "factor")
    dat2 <- t(dat[, -character_indexes]) 
    aggr <- by(dat2, INDICES= row.names(dat2), FUN= colSums) 
    aggr <- as.data.frame(do.call(cbind,aggr)) 
    dat <- cbind(dat[, character_indexes[1]], aggr, dat[, character_indexes[-1]])
  }
  
  ##################### Find the relative counts ##########################################################
  # find the variable indexes that are numeric 
  numeric_indexes <- choose_indexes(dat)
  
  # calculate total counts and set it as a data frame
  total_counts <- as.data.frame(rowSums(dat[, numeric_indexes]), nm = "total_counts")
  
  # bind the new data frame to the existing one and filter samples if the user selects so!
  if (SampleFilter == TRUE){
    dat <- dat %>% bind_cols(total_counts) %>% filter(!total_counts <= SampleCutoff)} 
  else{
    dat<- bind_cols(dat, total_counts)}
  
  # change 0 values to pseudocount if Pseudocount variable is TRUE
  if (Pseudocount == TRUE){
    dat <- dat %>% mutate_if(is.numeric, ~if_else(. <= 0.000000, 0.0000001, .))
  }
  
  # obtain indexes of only numeric variables
  numeric_indexes <- choose_indexes(dat)
  
  # subset data frame to only include the numeric variables
  dat2 <- dat[, numeric_indexes]
  
  # Find the relative proportion of each sample based on its total_counts
  n_row <- nrow(dat2) # number of rows
  n_col <- ncol(dat2) # number of columns
  
  for (r in 1:n_row){
    for (c in 1:n_col) {
      dat2[r, c] <- dat2[r, c]/dat2[r, n_col]  # this calculates the proportion
    }
  }
  
  #merge this data again into original data frame
  character_indexes <- choose_indexes(dat, "factor")
  dat <- bind_cols(dat[character_indexes[1]], dat2, dat[character_indexes[-1]])

  dat["total_counts"] <- NULL
  
  ##################### Columns with chosen counts#############################################################
  # take out any colums which have 0 counts for every row
  column_sums <- c()
  col_n <- ncol(dat)
  for (i in 1:col_n){
    if (class(unlist(dat[i])) == "numeric"){
      if (colSums(dat[i]) <= TaxaCutoff){
        column_sums <- append(column_sums, i)
      }
    }
  }
  dat[ column_sums] <- NULL
  
  ###################### Bind the metadata again to the processed data ###############################
  #character_indexes <- choose_indexes(metadata, type = "character")
  #metadata <- lapply(metadata[, character_indexes], as.factor) # change all metadata columns to factors if character
  if (length(Metadata) > 0) {
    dat <- bind_cols(dat, metadata)
    }
  
  return(as.data.frame(dat))
  
}