说明:
MIMIC的SQL查询确实够复杂,阻碍了很多想利用数据库做研究的人,也让人对已经做出来的研究的确实性产生怀疑 。
今天是几个官方SQL简单基础的查询,循序渐进, 相信能够更好地掌握。
SELECT column1, column2, ...
FROM table_name
WHERE condition
GROUP BY column1, column2, ...;
其中,column1、column2等是要选择的列的名称,table_name是要选择的表的名称。如果要选择所有列,则可以使用星号(*)代替列名。
-- begin query that extracts the data--本查询的目的是查询两个酶的表达
SELECT
MAX(subject_id) AS subject_id--MAX是从多条记录中取一条,最新的。
, MAX(hadm_id) AS hadm_id
, MAX(charttime) AS charttime
, le.specimen_id-- 索引列,使用GROUPBY
, MAX(CASE WHEN itemid = 51002 THEN value ELSE NULL END) AS troponin_i--这里的MAX一方面是求最大值,另一方面是多条记录转单条记录的操作
, MAX(CASE WHEN itemid = 51003 THEN value ELSE NULL END) AS troponin_t--CASE WHEN 这里是纵向转横向的一个操作,很多地方都见过。
, MAX(CASE WHEN itemid = 50911 THEN valuenum ELSE NULL END) AS ck_mb
FROM mimic_hosp.labevents le
WHERE le.itemid IN
(
-- 51002, -- Troponin I (troponin-I is not measured in MIMIC-IV)
-- 52598, -- Troponin I, point of care, rare/poor quality
51003, -- Troponin T
50911 -- Creatinine Kinase, MB isoenzyme
)
GROUP BY le.specimen_id
--按照每个样本分组,查询结果是specimen_id的值是唯一的,其他列可能有重复值。
--让我想起索引的操作,一系列的数据中是不是总是要有个作为索引呢?
;
SELECT
MAX(subject_id) AS subject_id
, MAX(hadm_id) AS hadm_id
, MAX(charttime) AS charttime
, le.specimen_id-- lab数据是使用这个作为索引,在其它的表中是其它的列
-- convert from itemid into a meaningful column
, MAX(CASE WHEN itemid = 51221 THEN valuenum ELSE NULL END) AS hematocrit
, MAX(CASE WHEN itemid = 51222 THEN valuenum ELSE NULL END) AS hemoglobin
, MAX(CASE WHEN itemid = 51248 THEN valuenum ELSE NULL END) AS mch
, MAX(CASE WHEN itemid = 51249 THEN valuenum ELSE NULL END) AS mchc
, MAX(CASE WHEN itemid = 51250 THEN valuenum ELSE NULL END) AS mcv
, MAX(CASE WHEN itemid = 51265 THEN valuenum ELSE NULL END) AS platelet
, MAX(CASE WHEN itemid = 51279 THEN valuenum ELSE NULL END) AS rbc
, MAX(CASE WHEN itemid = 51277 THEN valuenum ELSE NULL END) AS rdw
, MAX(CASE WHEN itemid = 52159 THEN valuenum ELSE NULL END) AS rdwsd
, MAX(CASE WHEN itemid = 51301 THEN valuenum ELSE NULL END) AS wbc
FROM mimic_hosp.labevents le
--经常用到的表格也就那么几个,hosp模块中的itemid(列)常作为限定条件,
--而valuenum(列)和value(列)常被取出作为分析目标。
WHERE le.itemid IN
(
51221, -- hematocrit
51222, -- hemoglobin
51248, -- MCH
51249, -- MCHC
51250, -- MCV
51265, -- platelets
51279, -- RBC
51277, -- RDW
52159, -- RDW SD
51301 -- WBC
)
AND valuenum IS NOT NULL--非空,去除缺失值的方法
-- lab values cannot be 0 and cannot be negative
AND valuenum > 0 --去除异常值
GROUP BY le.specimen_id--建立数据索引,为什么是这一列呢?lab数据一般是这一列。
;
-- This query pivots the vital signs for the entire patient stay.
-- Vital signs include heart rate, blood pressure, respiration rate, and temperature 这里是生命体征的测量, 求的是平均值
select
ce.subject_id
, ce.stay_id
, ce.charttime
, AVG(case when itemid in (220045) and valuenum > 0 and valuenum < 300 then valuenum else null end) as heart_rate
, AVG(case when itemid in (220179,220050) and valuenum > 0 and valuenum < 400 then valuenum else null end) as sbp
, AVG(case when itemid in (220180,220051) and valuenum > 0 and valuenum < 300 then valuenum else null end) as dbp
, AVG(case when itemid in (220052,220181,225312) and valuenum > 0 and valuenum < 300 then valuenum else null end) as mbp
, AVG(case when itemid = 220179 and valuenum > 0 and valuenum < 400 then valuenum else null end) as sbp_ni
, AVG(case when itemid = 220180 and valuenum > 0 and valuenum < 300 then valuenum else null end) as dbp_ni
, AVG(case when itemid = 220181 and valuenum > 0 and valuenum < 300 then valuenum else null end) as mbp_ni
, AVG(case when itemid in (220210,224690) and valuenum > 0 and valuenum < 70 then valuenum else null end) as resp_rate
, ROUND(
AVG(case when itemid in (223761) and valuenum > 70 and valuenum < 120 then (valuenum-32)/1.8 -- converted to degC in valuenum call
when itemid in (223762) and valuenum > 10 and valuenum < 50 then valuenum else null end)
, 2) as temperature
, MAX(CASE WHEN itemid = 224642 THEN value ELSE NULL END) AS temperature_site--分类
, AVG(case when itemid in (220277) and valuenum > 0 and valuenum <= 100 then valuenum else null end) as spo2--SELECT内去除异常值
, AVG(case when itemid in (225664,220621,226537) and valuenum > 0 then valuenum else null end) as glucose
FROM mimic_icu.chartevents ce --cha
where ce.stay_id IS NOT NULL
and ce.itemid in
(
220045, -- Heart Rate
225309, -- ART BP Systolic
225310, -- ART BP Diastolic
225312, -- ART BP Mean
220050, -- Arterial Blood Pressure systolic
220051, -- Arterial Blood Pressure diastolic
220052, -- Arterial Blood Pressure mean
220179, -- Non Invasive Blood Pressure systolic
220180, -- Non Invasive Blood Pressure diastolic
220181, -- Non Invasive Blood Pressure mean
220210, -- Respiratory Rate
224690, -- Respiratory Rate (Total)
220277, -- SPO2, peripheral
-- GLUCOSE, both lab and fingerstick
225664, -- Glucose finger stick
220621, -- Glucose (serum)
226537, -- Glucose (whole blood)
-- TEMPERATURE
223762, -- "Temperature Celsius"
223761, -- "Temperature Fahrenheit"
224642 -- Temperature Site
-- 226329 -- Blood Temperature CCO (C)
)--这些编码能不能用在我们自建的数据库中? 可以自己制作或者网上寻找itemid的对照表,方便了解itemid中的内容。
group by ce.subject_id, ce.stay_id, ce.charttime--select中没有使用聚合函数的,就使用GROUPBY?
;
SQL计算并生成新的列(临时表的使用)
mimi_derived这个表中是经过预处理的数据,有大用。
-- This query extracts the serum creatinine baselines of adult patients on each hospital admission.
-- The baseline is determined by the following rules:
-- i. if the lowest creatinine value during this admission is normal (<1.1), then use the value
-- ii. if the patient is diagnosed with chronic kidney disease (CKD), then use the lowest creatinine value during the admission, although it may be rather large.
-- iii. Otherwise, we estimate the baseline using the Simplified MDRD Formula:
-- eGFR = 186 × Scr^(-1.154) × Age^(-0.203) × 0.742Female
-- Let eGFR = 75. Scr = [ 75 / 186 / Age^(-0.203) / (0.742Female) ] ^ (1/-1.154)
WITH p as
(
SELECT
ag.subject_id
, ag.hadm_id
, ag.age
, p.gender
, CASE WHEN p.gender='F' THEN
POWER(75.0 / 186.0 / POWER(ag.age, -0.203) / 0.742, -1/1.154)
ELSE
POWER(75.0 / 186.0 / POWER(ag.age, -0.203), -1/1.154)
END
AS MDRD_est
FROM `physionet-data.mimic_derived.age` ag
LEFT JOIN `physionet-data.mimic_core.patients` p
ON ag.subject_id = p.subject_id
WHERE ag.age >= 18
)
, lab as
(
SELECT
hadm_id
, MIN(creatinine) AS scr_min
FROM `physionet-data.mimic_derived.chemistry`
GROUP BY hadm_id
)
, ckd as --建立一个临时表, 用诊断限定查询的范围
(
SELECT hadm_id, MAX(1) AS CKD_flag
FROM mimic_hosp.diagnoses_icd
--另外一个常用的表格,其中的icd_code和icd_version常结合使用。
WHERE
(
SUBSTR(icd_code, 1, 3) = '585'
--String数据的处理
AND
icd_version = 9
)
OR
(
SUBSTR(icd_code, 1, 3) = 'N18'
AND
icd_version = 10
)
GROUP BY 1-- 1这里指第一列
)
--主查询的功能是罗列查询的列,连接各个表格,较复杂的都在临时表处理好。
SELECT
p.hadm_id
, p.gender
, p.age
, lab.scr_min
, COALESCE(ckd.ckd_flag, 0) AS ckd
, p.MDRD_est
, CASE
WHEN lab.scr_min<=1.1 THEN scr_min
WHEN ckd.ckd_flag=1 THEN scr_min
ELSE MDRD_est END AS scr_baseline
FROM p
LEFT JOIN lab
ON p.hadm_id = lab.hadm_id
LEFT JOIN ckd
ON p.hadm_id = ckd.hadm_id
;
5.单位转换:一列中有不同来源的数据(单位不同),分开处理,再合并,注意列中是否有单位不一致的情况。
并去除超范围的值。
icu模块中重要的表格叫做chartevents,其中的itemid(列)和valuenum是最重要的列。
-- prep height
WITH ht_in AS
(
SELECT
c.subject_id, c.stay_id, c.charttime
-- Ensure that all heights are in centimeters 单位的一个转换,为了后续的合并
, ROUND(c.valuenum * 2.54, 2) AS height
, c.valuenum as height_orig
FROM `physionet-data.mimic_icu.chartevents` c
WHERE c.valuenum IS NOT NULL
-- Height (measured in inches)
AND c.itemid = 226707
)
, ht_cm AS--两个临时表用逗号分隔
(
SELECT
c.subject_id, c.stay_id, c.charttime
-- Ensure that all heights are in centimeters
, ROUND(c.valuenum, 2) AS height
FROM `physionet-data.mimic_icu.chartevents` c
WHERE c.valuenum IS NOT NULL
-- Height cm
AND c.itemid = 226730
)
-- merge cm/height, only take 1 value per charted row
, ht_stg0 AS
(
SELECT
COALESCE(h1.subject_id, h1.subject_id) as subject_id
--coalesce()的英文意思是合并的作用是:返回传入的参数中第一个非null的值
, COALESCE(h1.stay_id, h1.stay_id) AS stay_id
, COALESCE(h1.charttime, h1.charttime) AS charttime
, COALESCE(h1.height, h2.height) as height
FROM ht_cm h1
FULL OUTER JOIN ht_in h2
ON h1.subject_id = h2.subject_id
AND h1.charttime = h2.charttime
)
--主查询
SELECT subject_id, stay_id, charttime, height
FROM ht_stg0
WHERE height IS NOT NULL--去掉缺失值
-- filter out bad heights 过滤有可能出错的数据
AND height > 120 AND height < 230;
select
stay_id
, charttime
, sum(urineoutput) as urineoutput
from
(
select
-- patient identifiers
oe.stay_id
, oe.charttime
-- volumes associated with urine output ITEMIDs
-- note we consider input of GU irrigant as a negative volume
-- GU irrigant volume in usually has a corresponding volume out
-- so the net is often 0, despite large irrigant volumes
, case
when oe.itemid = 227488 and oe.value > 0 then -1*oe.value
else oe.value
end as urineoutput
from `physionet-data.mimic_icu.outputevents` oe
where itemid in
(
226559, -- Foley
226560, -- Void
226561, -- Condom Cath
226584, -- Ileoconduit
226563, -- Suprapubic
226564, -- R Nephrostomy
226565, -- L Nephrostomy
226567, -- Straight Cath
226557, -- R Ureteral Stent
226558, -- L Ureteral Stent
227488, -- GU Irrigant Volume In
227489 -- GU Irrigant/Urine Volume Out
)
) uo
group by stay_id, charttime--outputevents的索引列
;
-- For reference, some common unit conversions:
-- 10^9/L == K/uL == 10^3/uL
WITH blood_diff AS
(
SELECT
MAX(subject_id) AS subject_id--分类变量都用MAX
, MAX(hadm_id) AS hadm_id
, MAX(charttime) AS charttime
, le.specimen_id-- lab通常使用这个作为索引
-- create one set of columns for percentages, and one set of columns for counts
-- we harmonize all count units into K/uL == 10^9/L
-- counts have an "_abs" suffix, percentages do not
-- absolute counts
, MAX(CASE WHEN itemid in (51300, 51301, 51755) THEN valuenum ELSE NULL END) AS wbc
, MAX(CASE WHEN itemid = 52069 THEN valuenum ELSE NULL END) AS basophils_abs
-- 52073 in K/uL, 51199 in #/uL
, MAX(CASE WHEN itemid = 52073 THEN valuenum WHEN itemid = 51199 THEN valuenum / 1000.0 ELSE NULL END) AS eosinophils_abs
-- 51133 in K/uL, 52769 in #/uL
, MAX(CASE WHEN itemid = 51133 THEN valuenum WHEN itemid = 52769 THEN valuenum / 1000.0 ELSE NULL END) AS lymphocytes_abs
-- 52074 in K/uL, 51253 in #/uL
, MAX(CASE WHEN itemid = 52074 THEN valuenum WHEN itemid = 51253 THEN valuenum / 1000.0 ELSE NULL END) AS monocytes_abs
, MAX(CASE WHEN itemid = 52075 THEN valuenum ELSE NULL END) AS neutrophils_abs
-- convert from #/uL to K/uL
, MAX(CASE WHEN itemid = 51218 THEN valuenum / 1000.0 ELSE NULL END) AS granulocytes_abs
-- percentages, equal to cell count / white blood cell count
, MAX(CASE WHEN itemid = 51146 THEN valuenum ELSE NULL END) AS basophils
, MAX(CASE WHEN itemid = 51200 THEN valuenum ELSE NULL END) AS eosinophils
, MAX(CASE WHEN itemid in (51244, 51245) THEN valuenum ELSE NULL END) AS lymphocytes
, MAX(CASE WHEN itemid = 51254 THEN valuenum ELSE NULL END) AS monocytes
, MAX(CASE WHEN itemid = 51256 THEN valuenum ELSE NULL END) AS neutrophils
-- other cell count percentages
, MAX(CASE WHEN itemid = 51143 THEN valuenum ELSE NULL END) AS atypical_lymphocytes
, MAX(CASE WHEN itemid = 51144 THEN valuenum ELSE NULL END) AS bands
, MAX(CASE WHEN itemid = 52135 THEN valuenum ELSE NULL END) AS immature_granulocytes
, MAX(CASE WHEN itemid = 51251 THEN valuenum ELSE NULL END) AS metamyelocytes
, MAX(CASE WHEN itemid = 51257 THEN valuenum ELSE NULL END) AS nrbc
-- utility flags which determine whether imputation is possible
, CASE
-- WBC is available
WHEN MAX(CASE WHEN itemid in (51300, 51301, 51755) THEN valuenum ELSE NULL END) > 0
-- and we have at least one percentage from the diff
-- sometimes the entire diff is 0%, which looks like bad data
AND SUM(CASE WHEN itemid IN (51146, 51200, 51244, 51245, 51254, 51256) THEN valuenum ELSE NULL END) > 0
THEN 1 ELSE 0 END AS impute_abs
FROM mimic_hosp.labevents le
WHERE le.itemid IN
(
51146, -- basophils
52069, -- Absolute basophil count
51199, -- Eosinophil Count
51200, -- Eosinophils
52073, -- Absolute Eosinophil count
51244, -- Lymphocytes
51245, -- Lymphocytes, Percent
51133, -- Absolute Lymphocyte Count
52769, -- Absolute Lymphocyte Count
51253, -- Monocyte Count
51254, -- Monocytes
52074, -- Absolute Monocyte Count
51256, -- Neutrophils
52075, -- Absolute Neutrophil Count
51143, -- Atypical lymphocytes
51144, -- Bands (%)
51218, -- Granulocyte Count
52135, -- Immature granulocytes (%)
51251, -- Metamyelocytes
51257, -- Nucleated Red Cells
-- wbc totals measured in K/uL
51300, 51301, 51755
-- 52220 (wbcp) is percentage
-- below are point of care tests which are extremely infrequent and usually low quality
-- 51697, -- Neutrophils (mmol/L)
-- below itemid do not have data as of MIMIC-IV v1.0
-- 51536, -- Absolute Lymphocyte Count
-- 51537, -- Absolute Neutrophil
-- 51690, -- Lymphocytes
-- 52151, -- NRBC
)
AND valuenum IS NOT NULL
-- differential values cannot be negative
AND valuenum >= 0
GROUP BY le.specimen_id
)
--以下是主查询
SELECT
subject_id, hadm_id, charttime, specimen_id
, wbc
-- impute absolute count if percentage & WBC is available
, ROUND(CASE
WHEN basophils_abs IS NULL AND basophils IS NOT NULL AND impute_abs = 1
THEN basophils * wbc
ELSE basophils_abs
END, 4) AS basophils_abs
, ROUND(CASE
WHEN eosinophils_abs IS NULL AND eosinophils IS NOT NULL AND impute_abs = 1
THEN eosinophils * wbc
ELSE eosinophils_abs
END, 4) AS eosinophils_abs
, ROUND(CASE
WHEN lymphocytes_abs IS NULL AND lymphocytes IS NOT NULL AND impute_abs = 1
THEN lymphocytes * wbc
ELSE lymphocytes_abs
END, 4) AS lymphocytes_abs
, ROUND(CASE
WHEN monocytes_abs IS NULL AND monocytes IS NOT NULL AND impute_abs = 1
THEN monocytes * wbc
ELSE monocytes_abs
END, 4) AS monocytes_abs
, ROUND(CASE
WHEN neutrophils_abs IS NULL AND neutrophils IS NOT NULL AND impute_abs = 1
THEN neutrophils * wbc
ELSE neutrophils_abs
END, 4) AS neutrophils_abs
, basophils
, eosinophils
, lymphocytes
, monocytes
, neutrophils
-- impute bands/blasts?
, atypical_lymphocytes
, bands
, immature_granulocytes
, metamyelocytes
, nrbc
FROM blood_diff
;
从简单查询中,可以学到以下几点: