|
| 1 | +#!/bin/bash |
| 2 | +# This shell script converts BigQuery .sql files into PostgreSQL .sql files. |
| 3 | + |
| 4 | +# path in which we create the postgres concepts |
| 5 | +TARGET_PATH='../concepts_postgres' |
| 6 | +mkdir -p $TARGET_PATH |
| 7 | + |
| 8 | +# String replacements are necessary for some queries. |
| 9 | + |
| 10 | +# Schema replacement: change `physionet-data.<dataset>.<table>` to just <table> (with no backticks) |
| 11 | +export REGEX_SCHEMA='s/`physionet-data.(mimiciii_clinical|mimiciii_derived|mimiciii_notes).([A-Za-z0-9_-]+)`/\2/g' |
| 12 | +# Note that these queries are very senstive to changes, e.g. adding whitespaces after comma can already change the behavior. |
| 13 | +export REGEX_DATETIME_DIFF="s/DATETIME_DIFF\(([^,]+), ?(.*), ?(DAY|MINUTE|SECOND|HOUR|YEAR)\)/DATETIME_DIFF(\1, \2, '\3')/g" |
| 14 | +export REGEX_DATETIME_TRUNC="s/DATETIME_TRUNC\(([^,]+), ?(DAY|MINUTE|SECOND|HOUR|YEAR)\)/DATE_TRUNC('\2', \1)/g" |
| 15 | +# Add necessary quotes to INTERVAL, e.g. "INTERVAL 5 hour" to "INTERVAL '5' hour" |
| 16 | +export REGEX_INTERVAL="s/interval ([[:digit:]]+) (hour|day|month|year)/INTERVAL '\1' \2/gI" |
| 17 | +# Specific queries for some problems that arose with some files. |
| 18 | +export REGEX_INT="s/CAST\(hr AS INT64\)/CAST\(hr AS bigint\)/g" |
| 19 | +export REGEX_ARRAY="s/GENERATE_ARRAY\(-24, CEIL\(DATETIME\_DIFF\(it\.outtime_hr, it\.intime_hr, HOUR\)\)\)/ARRAY\(SELECT \* FROM generate\_series\(-24, CEIL\(DATETIME\_DIFF\(it\.outtime_hr, it\.intime_hr, HOUR\)\)\)\)/g" |
| 20 | +export REGEX_HOUR_INTERVAL="s/INTERVAL CAST\(hr AS INT64\) HOUR/interval \'1\' hour * CAST\(hr AS bigint\)/g" |
| 21 | +export REGEX_SECONDS="s/SECOND\)/\'SECOND\'\)/g" |
| 22 | + |
| 23 | +# tables we want to run before all other concepts |
| 24 | +# usually because they are used as dependencies |
| 25 | +DIR_AND_TABLES_TO_PREBUILD='demographics.icustay_times demographics.icustay_hours .echo_data .code_status .rrt durations.weight_durations fluid_balance.urine_output organfailure.kdigo_uo' |
| 26 | + |
| 27 | +# tables which are written directly in postgresql and source code controlled |
| 28 | +# this is usually because there is no trivial conversion between bq/psql syntax |
| 29 | +DIR_AND_TABLES_ALREADY_IN_PSQL='demographics.icustay_times demographics.icustay_hours demographics.note_counts diagnosis.ccs_dx' |
| 30 | + |
| 31 | +# tables which we want to run after all other concepts |
| 32 | +# usually because they depend on one or more other queries |
| 33 | +DIR_AND_TABLES_TO_SKIP='' |
| 34 | + |
| 35 | +# First, we re-create the postgres-make-concepts.sql file. |
| 36 | +echo "\echo ''" > $TARGET_PATH/postgres-make-concepts.sql |
| 37 | + |
| 38 | +# Now we add some preamble for the user running the script. |
| 39 | +echo "\echo '==='" >> $TARGET_PATH/postgres-make-concepts.sql |
| 40 | +echo "\echo 'Beginning to create materialized views for MIMIC database.'" >> $TARGET_PATH/postgres-make-concepts.sql |
| 41 | +echo "\echo '"'Any notices of the form "NOTICE: materialized view "XXXXXX" does not exist" can be ignored.'"'" >> $TARGET_PATH/postgres-make-concepts.sql |
| 42 | +echo "\echo 'The scripts drop views before creating them, and these notices indicate nothing existed prior to creating the view.'" >> $TARGET_PATH/postgres-make-concepts.sql |
| 43 | +echo "\echo '==='" >> $TARGET_PATH/postgres-make-concepts.sql |
| 44 | +echo "\echo ''" >> $TARGET_PATH/postgres-make-concepts.sql |
| 45 | + |
| 46 | +# ======================================== # |
| 47 | +# === CONCEPTS WHICH WE MUST RUN FIRST === # |
| 48 | +# ======================================== # |
| 49 | +echo -n "Dependencies:" |
| 50 | + |
| 51 | +# output table creation calls to the make-concepts script |
| 52 | +echo "" >> $TARGET_PATH/postgres-make-concepts.sql |
| 53 | +echo "-- dependencies" >> $TARGET_PATH/postgres-make-concepts.sql |
| 54 | + |
| 55 | +for dir_and_table in $DIR_AND_TABLES_TO_PREBUILD; |
| 56 | +do |
| 57 | + d=`echo ${dir_and_table} | cut -d. -f1` |
| 58 | + tbl=`echo ${dir_and_table} | cut -d. -f2` |
| 59 | + |
| 60 | + if [[ $d == '' ]]; then |
| 61 | + d='.' |
| 62 | + fi |
| 63 | + |
| 64 | + # make the sub-folder for postgres if it does not exist |
| 65 | + mkdir -p "$TARGET_PATH/${d}" |
| 66 | + |
| 67 | + # convert the bigquery script to psql and output it to the appropriate subfolder |
| 68 | + echo -n " ${d}.${tbl} .." |
| 69 | + |
| 70 | + # re-write the script into psql using regex |
| 71 | + # the if statement ensures we do not overwrite tables which are already written in psql |
| 72 | + if ! [[ "$DIR_AND_TABLES_ALREADY_IN_PSQL" =~ "$d.$tbl" ]]; then |
| 73 | + echo "-- THIS SCRIPT IS AUTOMATICALLY GENERATED. DO NOT EDIT IT DIRECTLY." > "${TARGET_PATH}/${d}/${tbl}.sql" |
| 74 | + echo "DROP TABLE IF EXISTS ${tbl}; CREATE TABLE ${tbl} AS " >> "${TARGET_PATH}/${d}/${tbl}.sql" |
| 75 | + cat "${d}/${tbl}.sql" | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_DATETIME_TRUNC}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" >> "${TARGET_PATH}/${d}/${tbl}.sql" |
| 76 | + fi |
| 77 | + |
| 78 | + # write out a call to this script in the make concepts file |
| 79 | + echo "\i ${d}/${tbl}.sql" >> $TARGET_PATH/postgres-make-concepts.sql |
| 80 | +done |
| 81 | +echo " done!" |
| 82 | + |
| 83 | +# ================================== # |
| 84 | +# === MAIN LOOP FOR ALL CONCEPTS === # |
| 85 | +# ================================== # |
| 86 | + |
| 87 | +# Iterate through each concept subfolder, and: |
| 88 | +# (1) apply the above regular expressions to update the script |
| 89 | +# (2) output to the postgres subfolder |
| 90 | +# (3) add a line to the postgres-make-concepts.sql script to generate this table |
| 91 | + |
| 92 | +# organfailure.kdigo_stages firstday.first_day_sofa sepsis.sepsis3 medication.vasoactive_agent medication.norepinephrine_equivalent_dose |
| 93 | + |
| 94 | +# the order *only* matters during the conversion step because our loop is |
| 95 | +# inserting table build commands into the postgres-make-concepts.sql file |
| 96 | +for d in durations comorbidity demographics firstday fluid_balance sepsis diagnosis organfailure severityscores; |
| 97 | +do |
| 98 | + mkdir -p "$TARGET_PATH/${d}" |
| 99 | + echo -n "${d}:" |
| 100 | + echo "" >> $TARGET_PATH/postgres-make-concepts.sql |
| 101 | + echo "-- ${d}" >> $TARGET_PATH/postgres-make-concepts.sql |
| 102 | + for fn in `ls $d`; |
| 103 | + do |
| 104 | + # only run SQL queries |
| 105 | + if [[ "${fn: -4}" == ".sql" ]]; then |
| 106 | + # table name is file name minus extension |
| 107 | + tbl="${fn%????}" |
| 108 | + echo -n " ${tbl} " |
| 109 | + |
| 110 | + if [[ "$DIR_AND_TABLES_TO_PREBUILD" =~ "$d.$tbl" ]]; then |
| 111 | + echo -n "(exists!) .." |
| 112 | + continue |
| 113 | + elif [[ "$DIR_AND_TABLES_TO_SKIP" =~ "$d.$tbl" ]]; then |
| 114 | + echo -n "(skipping!) .." |
| 115 | + continue |
| 116 | + else |
| 117 | + echo -n ".." |
| 118 | + fi |
| 119 | + |
| 120 | + # re-write the script into psql using regex |
| 121 | + # the if statement ensures we do not overwrite tables which are already written in psql |
| 122 | + if ! [[ "$DIR_AND_TABLES_ALREADY_IN_PSQL" =~ "$d.$tbl" ]]; then |
| 123 | + echo "-- THIS SCRIPT IS AUTOMATICALLY GENERATED. DO NOT EDIT IT DIRECTLY." > "${TARGET_PATH}/${d}/${tbl}.sql" |
| 124 | + echo "DROP TABLE IF EXISTS ${tbl}; CREATE TABLE ${tbl} AS " >> "${TARGET_PATH}/${d}/${tbl}.sql" |
| 125 | + cat "${d}/${tbl}.sql" | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_DATETIME_TRUNC}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" >> "${TARGET_PATH}/${d}/${fn}" |
| 126 | + fi |
| 127 | + |
| 128 | + # add statement to generate this table to make concepts script |
| 129 | + echo "\i ${d}/${fn}" >> ${TARGET_PATH}/postgres-make-concepts.sql |
| 130 | + fi |
| 131 | + done |
| 132 | + echo " done!" |
| 133 | +done |
| 134 | + |
| 135 | +# finally generate first_day_sofa which depends on concepts in firstday folder |
| 136 | +echo "" >> ${TARGET_PATH}/postgres-make-concepts.sql |
| 137 | +echo "-- final tables which were dependent on one or more prior tables" >> ${TARGET_PATH}/postgres-make-concepts.sql |
| 138 | + |
| 139 | +echo -n "final:" |
| 140 | +for dir_and_table in $DIR_AND_TABLES_TO_SKIP |
| 141 | +do |
| 142 | + d=`echo ${dir_and_table} | cut -d. -f1` |
| 143 | + tbl=`echo ${dir_and_table} | cut -d. -f2` |
| 144 | + |
| 145 | + # make the sub-folder for postgres if it does not exist |
| 146 | + mkdir -p "$TARGET_PATH/${d}" |
| 147 | + |
| 148 | + # convert the bigquery script to psql and output it to the appropriate subfolder |
| 149 | + echo -n " ${d}.${tbl} .." |
| 150 | + if ! [[ "$DIR_AND_TABLES_ALREADY_IN_PSQL" =~ "$d.$tbl" ]]; then |
| 151 | + echo "-- THIS SCRIPT IS AUTOMATICALLY GENERATED. DO NOT EDIT IT DIRECTLY." > "${TARGET_PATH}/${d}/${tbl}.sql" |
| 152 | + echo "DROP TABLE IF EXISTS ${tbl}; CREATE TABLE ${tbl} AS " >> "${TARGET_PATH}/${d}/${tbl}.sql" |
| 153 | + cat "${d}/${tbl}.sql" | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_DATETIME_TRUNC}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" >> "${TARGET_PATH}/${d}/${fn}" |
| 154 | + fi |
| 155 | + # write out a call to this script in the make concepts file |
| 156 | + echo "\i ${d}/${tbl}.sql" >> $TARGET_PATH/postgres-make-concepts.sql |
| 157 | +done |
| 158 | +echo " done!" |
0 commit comments