---
execute:
  echo: false
date: "`r Sys.Date()`"
format:
  html:
    code-fold: true
    code-tools: true
    code-summary: "Show the code"
---

# Data Sourcing
```{r echo = F, message = F}
## initialization
# clear environment
rm(list=ls())
# load packages
library(tidyverse)
library(lubridate)
library(readr)
library(readxl)
library(writexl)
library(hms)
library(plotly)
library(DT)
library(openxlsx)
library(leaflet)
library(ggpubr)
library(plotrix)
library(remotes)
library(magrittr)
library(janitor)
library(xfun)
library(cowplot)
#install_github("USGS-R/EGRET")
select <- dplyr::select
# to do: add explanatory content to this qmd document
# - reference the data absence issue in EPA How's My Waterway (fix in progress)
# - reference the total extent of data available in EPA WQX
# - make sure the outliers.xlsx doc can be downloaded & accessed
# - modify such that outliers are labeled and present in the table downloads, but not shown in the boxplots
# - confirm outlier removal works; organize to prep for EPA WQX sync. WAIT UNTIL ALL DATA SYNCED TO THIS DOCUMENT BEFORE EXCISING OUTLIERS
```
```{r echo = F}
##### Notes on data sourcing
# Data are sourced from the following queries at <https://waterqualitydata.us>, run on Feb 24, 2021:
# CSV download for SAMPLE data: <https://www.waterqualitydata.us/portal/#bBox=-151.322501%2C60.274310%2C-149.216144%2C60.738915&mimeType=csv&dataProfile=narrowResult>
# CSV download for STATION data: <https://www.waterqualitydata.us/portal/#countrycode=US&statecode=US%3A02&countycode=US%3A02%3A122&bBox=-151.322501%2C60.274310%2C-149.216144%2C60.738915&mimeType=csv>
# Note: these CSV files are excluded from the GitHub repository because they are too large to sync. To reproduce the analysis, download and save these files locally instead (see the ReadMe file at data/WQX_downloads in the repository).
# Re-running these same queries in the future will download the most current CSV files, which may include additional data received since the last time this script was run.
# We initially attempted to perform this retrieval with the dataRetrieval package, but were not successful. As of 2025, this functionality has transitioned to EPA's "TADA" (Tools for Automated Data Analysis) package.
#library(dataRetrieval)
# Note: in the future, I hope to automatically retrieve the most current water quality data from the EPA WQX.
```
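
For future automation, a programmatic query could replace the manual portal downloads described above. The chunk below is an untested sketch using the dataRetrieval package; the bounding box and the "narrowResult" profile mirror the manual portal queries, and the object names (`kenai_bbox`, `wqp_samples`, `wqp_sites`) are hypothetical.

```{r eval = F}
# hypothetical sketch: retrieve the same WQP data programmatically with dataRetrieval
library(dataRetrieval)

# bounding box matching the portal queries above (west, south, east, north)
kenai_bbox <- "-151.322501,60.274310,-149.216144,60.738915"

# sample results, using the same "narrowResult" data profile as the manual download
wqp_samples <- readWQPdata(bBox = kenai_bbox, dataProfile = "narrowResult")

# station metadata for the same bounding box
wqp_sites <- whatWQPsites(bBox = kenai_bbox)

# EPA's TADA package (the successor tooling noted above) also wraps WQP queries;
# see TADA::TADA_DataRetrieval() for a maintained interface
```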
```{r echo = F, warning=FALSE, results='hide',message=FALSE}
# prepare and pre-process data that was downloaded from EPA WQX as described above
# designate download date
download_date <- "20230215"
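# added check (a minimal guard, not in the original workflow): confirm the
# locally saved WQX downloads exist before reading, since these large CSVs
# are excluded from the GitHub repository (see notes above)
stopifnot(file.exists("other/input/WQX_downloads/narrowresult/narrowresult.csv"),
          file.exists("other/input/WQX_downloads/station/station.csv"))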
# import sample data
dat <- read_csv("other/input/WQX_downloads/narrowresult/narrowresult.csv") %>%
# retain only KWF data
filter(OrganizationFormalName == "Kenai Watershed Forum(Volunteer)*") %>%
# remove empty columns
select_if(~!(all(is.na(.))))
# remove extraneous text from "Kenai Watershed Forum(Volunteer)*"
dat %<>%
mutate(OrganizationFormalName = gsub("\\s*\\([^\\)]+\\)","",as.character(OrganizationFormalName))) %>%
mutate(OrganizationFormalName = str_remove(OrganizationFormalName,"\\*"))
# import station data, from a separate file
site_dat <- read_csv("other/input/WQX_downloads/station/station.csv") %>%
# retain only Kenai Baseline sites (contains "KBL")
filter(grepl("KBL", MonitoringLocationName)) %>%
# remove empty columns
select_if(~!(all(is.na(.))))
# standardize the organization name (replaces "Kenai Watershed Forum(Volunteer)*" directly)
site_dat %<>%
mutate(OrganizationFormalName = "Kenai Watershed Forum")
# Retain surface water sites only. Exclude well sampling sites.
# create and apply filter
#surface <- c("River/Stream","Lake","River/Stream Perennial","BEACH Program Site-Ocean","BEACH Program Site-River/Stream","Lake, Reservoir, Impoundment","Stream","Spring")
#site_dat %<>%
# filter(OrganizationFormalName == "Kenai Watershed Forum")
#filter(MonitoringLocationTypeName %in% surface)
# join site and sample data
dat <- left_join(dat,site_dat) %>%
# retain only observations with all site data
filter(!is.na(LatitudeMeasure))
# in the future, use the EGRET package (http://usgs-r.github.io/EGRET/articles/EGRET.html) for automated database queries.
# potentially also useful: https://waterdata.usgs.gov/nwis/inventory?search_criteria=lat_long_bounding_box&submitted_form=introduction
# What organizations in the database contain the word "Kenai"?
# filter if OrganizationFormalName has the term "Kenai"
#kenai_orgs <- dat %>%
# select(OrganizationIdentifier,OrganizationFormalName) %>%
# filter(grepl("Kenai",OrganizationFormalName)) %>%
# distinct()
# What kind of sites are present in our data set?
#site_types <- data.frame(unique(dat$MonitoringLocationTypeName))
```
```{r echo = F, warning=FALSE, results='hide',message=FALSE}
### Miscellaneous additional dataframe prep ###
# specify date format
dat %<>%
mutate(ActivityStartDate = date(ActivityStartDate)) %>%
# remove missing observations
filter(!is.na(ResultMeasureValue))
# Remove air temperature observations
dat %<>%
filter(CharacteristicName != "Temperature, air")
# UNITS
# bacteria - fecal coliform should all be "cfu/100ml"
dat <- dat %>%
mutate(`ResultMeasure/MeasureUnitCode` = case_when(
CharacteristicName == "Fecal Coliform" ~ "cfu/100ml",
TRUE ~ `ResultMeasure/MeasureUnitCode`))
# SITE NAME CORRECTIONS
# Create and export table of site names
site_tbl <- dat %>%
group_by(MonitoringLocationName,LatitudeMeasure,LongitudeMeasure) %>%
summarise(min_date = min(ActivityStartDate),
max_date = max(ActivityStartDate)) %>%
rename("latitude" = "LatitudeMeasure",
"longitude" = "LongitudeMeasure")
# export csv of site names as received from EPA WQX
write.csv(site_tbl,"other/output/field_qa_qc_data/2000_2014_sitenames.csv", row.names = F)
# read back in the manually edited table with info on mainstem/trib and river mile
site_tbl <- read_csv("other/input/baseline_sites.csv") %>%
remove_empty() %>%
select(-latitude,-longitude)
# 2/15/2023: having trouble with this join for unknown reasons.
# Instead, we can extract trib/mainstem info and river mile from the existing MonitoringLocationName column (e.g. KBL_t_00.0 = Kenai Baseline, tributary, river mile 0.0)
dat %<>%
separate_wider_delim(MonitoringLocationName, delim = "_", names = c("project","trib_mainstem","river_mile"), cols_remove = F)
# transition column names to "easier" formats
dat %<>% clean_names()
# export a csv with current column names, to provide the column format names for a separate, manually curated outlier .xlsx file located in the same directory
write.csv(dat,"other/input/outliers/baseline_outliers.csv")
# Create column of tributary names (e.g. Beaver Creek) to associate with tributary river miles
# read in tributary names
trib_names <- read.csv("other/input/baseline_sites.csv") %>%
select(MonitoringLocationName, tributary_name, tributary_mainstem) %>%
filter(tributary_mainstem == "tributary") %>%
clean_names() %>%
select(-tributary_mainstem)
# join trib names to overall table
dat %<>% left_join(trib_names, by = "monitoring_location_name")
# create general "site name" column. use tributary name if available, and river mile if not
dat %<>%
mutate(river_mile = as.character(as.numeric(river_mile))) %>%
mutate(site_name = case_when(
!is.na(tributary_name) ~ tributary_name,
is.na(tributary_name) ~ river_mile))
```
```{r echo = F, warning=FALSE, message=FALSE}
#### SEASONS BOUNDARY ####
### Designate samples as "spring" vs "summer" for each observation ("season")
# choose a "boundary date" between spring and summer (see appendix "Sample Event Timing" for rationale)
# boundary date selected is June 4 (julian day 155)
boundary_date <- 155
dat %<>%
mutate(julian_day = yday(activity_start_date)) %>%
mutate(season = case_when(
julian_day > 155 ~ "Summer",
julian_day < 155 ~ "Spring"
))
```
```{r echo = F, warning=FALSE, message=FALSE}
# OUTLIERS
# exclude manually identified outliers with an anti_join
# see "other/input/outliers/baseline_outliers.xlsx"
outliers <- read.xlsx("other/input/outliers/baseline_outliers.xlsx") %>%
select(activity_identifier,characteristic_name)
dat <- anti_join(dat,outliers, by = c("activity_identifier","characteristic_name"))
```
```{r echo = F, warning=FALSE, message=FALSE}
# EXPORT PREPROCESSED DATA TO LOCAL PROJECT DIRECTORY
# write finalized data output from all the steps above, to be used in the scripts that follow
# this export format represents the most granular presentation; nothing has been aggregated yet (the "reg_limits.qmd" script aggregates volatiles, for example)
write.csv(dat,"other/output/analysis_format/baseline_export_format.csv",row.names = F)
```
```{r eval = F}
### Exploratory Data Analysis
### Consider including these numbers in section intro
# How many years of data do we have, including from the waterqualitydata.us database AND the KWF local server?
years_tbl <- dat %>%
filter(organization_formal_name == "Kenai Watershed Forum") %>%
summarise(min_date = min(activity_start_date),
max_date = max(activity_start_date))
# How many different kinds of substances have we measured?
total_parameters <- dat %>%
filter(organization_formal_name == "Kenai Watershed Forum") %>%
distinct(characteristic_name) %>%
count()
# What are the names of all the substances we have measured?
parameter_names <- dat %>%
filter(organization_formal_name == "Kenai Watershed Forum") %>%
distinct(characteristic_name)
```