Execute a SQL query on the All of Us database

Description

Executes an SQL query on the All of Us database

Usage

aou_sql(
  query,
  collect = FALSE,
  debug = FALSE,
  ...,
  con = getOption("aou.default.con"),
  CDR = getOption("aou.default.cdr")
)

Arguments

query A SQL query (BigQuery dialect) to be executed. Interpreted with glue::glue(), so expressions enclosed with braces will be evaluated. References to “{CDR}” or “{cdr}” will be evaluated automatically (see examples).
collect Whether to bring the resulting table into local memory (collect = TRUE) as a dataframe or leave as a reference to a database table (for continued analysis using, e.g., dbplyr). Defaults to FALSE.
debug Print the query to the console; useful for debugging.
All other arguments passed to bigrquery::bq_table_download() if collect = TRUE.
con Connection to the allofus SQL database. Defaults to getOption(“aou.default.con”), which is created automatically with aou_connect(). Only needed if collect = FALSE.
CDR The name of the "curated data repository" that will be used in any references of the form “{CDR}” or “{cdr}” in the query (see examples). Defaults to getOption(“aou.default.cdr”), which is Sys.getenv(‘WORKSPACE_CDR’) if not specified otherwise (i.e., the "mainline" CDR). On the controlled tier, specify the "base" CDR with CDR = paste0(Sys.getenv(‘WORKSPACE_CDR’), “_base”).

Value

A dataframe if collect = TRUE; a reference to a remote database table if not.

Examples

library(allofus)


# Examples based on AoU snippets
aou_sql("
  -- Compute the count of unique participants in our All of Us cohort.
  SELECT
  COUNT(DISTINCT person_id) AS total_number_of_participants
  FROM
  `{CDR}.person`
", collect = TRUE)

MEASUREMENT_OF_INTEREST <- "hemoglobin"
aou_sql('
-- Compute summary information for our measurements of interest for our cohort.
--
-- PARAMETERS:
--   MEASUREMENT_OF_INTEREST: a case-insensitive string, such as "hemoglobin", to be compared
--                            to all measurement concept names to identify those of interest

WITH
  --
  -- Use a case insensitive string to search the measurement concept names of those
  -- measurements we do have in the measurements table.
  --
  labs_of_interest AS (
  SELECT
    measurement_concept_id,
    measurement_concept.concept_name AS measurement_name,
    unit_concept_id,
    unit_concept.concept_name AS unit_name
  FROM
    `{CDR}.measurement`
  LEFT JOIN `{CDR}.concept` AS measurement_concept
  ON measurement_concept.concept_id = measurement_concept_id
  LEFT JOIN `{CDR}.concept` AS unit_concept
  ON unit_concept.concept_id = unit_concept_id
  WHERE
    REGEXP_CONTAINS(measurement_concept.concept_name, r"(?i){MEASUREMENT_OF_INTEREST}")
  GROUP BY
    measurement_concept_id,
    unit_concept_id,
    measurement_concept.concept_name,
    unit_concept.concept_name
)
  --
  -- Summarize the information about each measurement concept of interest that our
  -- prior query identified.
  --
SELECT
  measurement_name AS measurement,
  IFNULL(unit_name, "NA") AS unit,
  COUNT(1) AS N,
  COUNTIF(value_as_number IS NULL
    AND (value_as_concept_id IS NULL
      OR value_as_concept_id = 0)) AS missing,
  MIN(value_as_number) AS min,
  MAX(value_as_number) AS max,
  AVG(value_as_number) AS avg,
  STDDEV(value_as_number) AS stddev,
  APPROX_QUANTILES(value_as_number, 4) AS quantiles,
  COUNTIF(value_as_number IS NOT NULL) AS num_numeric_values,
  COUNTIF(value_as_concept_id IS NOT NULL
      AND value_as_concept_id != 0) AS num_concept_values,
  COUNTIF(operator_concept_id IS NOT NULL) AS num_operators,
  IF(src_id = "PPI/PM", "PPI", "EHR") AS measurement_source,
  measurement_concept_id,
  unit_concept_id
FROM
  `{CDR}.measurement`
INNER JOIN
 labs_of_interest USING(measurement_concept_id, unit_concept_id)
LEFT JOIN
  `{CDR}.measurement_ext` USING(measurement_id)
GROUP BY
  measurement_concept_id,
  measurement_name,
  measurement_source,
  unit_concept_id,
  unit_name
ORDER BY
  N DESC
', collect = TRUE)