Skip to content

Commit bccc698

Browse files
committed
added mimiciii data extraction files
1 parent a709d56 commit bccc698

File tree

4 files changed

+373
-24
lines changed

4 files changed

+373
-24
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@ For running our model on univariate time series (UWave dataset):
1515
```bash
1616
python univariate_example.py --epochs 1000 --hidden_units 128 --ref_points 128 --batch_size 1024
1717
```
18+
To reproduce the results on the MIMIC-III dataset, you first need access to the dataset, which can be requested [here](https://mimic.physionet.org/gettingstarted/access/). Once your application to access MIMIC has been approved, you can download the data [here](https://physionet.org/works/MIMICIIIClinicalDatabase/). You can use the scripts available [here](https://physionet.org/works/MIMICIIIClinicalDatabase/) to import the dataset into a database. Once you have created the database, run the following scripts sequentially.
19+
```bash
20+
python mimic_data_extraction.py
21+
```
22+
```bash
23+
python multivariate_example.py --epochs 1000 --reference_points 192 --hours_from_adm 48 --batch_size 256 --gpus 4
24+
```
1825

1926
## References
2027
Satya Narayan Shukla and Benjamin Marlin. Interpolation-prediction networks for irregularly sampled time series. In International Conference on Learning Representations, 2019. \[[pdf](https://openreview.net/pdf?id=r1efr3C9Ym)\]

src/mimic_data_extraction.py

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
import pickle
2+
import psycopg2 as py
3+
4+
# Replace this with your mimic iii database details
DB_DSN = ("dbname = 'mimic' user = 'snshukla' host = 'localhost' "
          "port='5432' password = ''")

# Item ids summed up as "urine output" in the outputevents table.
URINE_OUTPUT_ITEMS = (
    40405, 40428, 41857, 42001, 42362, 42676, 43171, 43173, 42042, 42068,
    42111, 42119, 40715, 40056, 40061, 40085, 40094, 40096, 43897, 43931,
    43966, 44080, 44103, 44132, 44237, 43348, 43355, 43365, 43372, 43373,
    43374, 43379, 43380, 43431, 43462, 43522, 44706, 44911, 44925, 42810,
    42859, 43093, 44325, 44506, 43856, 45304, 46532, 46578, 46658, 46748,
    40651, 40055, 40057, 40065, 40069, 44752, 44824, 44837, 43576, 43589,
    43633, 43811, 43812, 46177, 46727, 46804, 43987, 44051, 44253, 44278,
    46180, 45804, 45841, 45927, 42592, 42666, 42765, 42892, 43053, 43057,
    42130, 41922, 40473, 43333, 43347, 44684, 44834, 43638, 43654, 43519,
    43537, 42366, 45991, 43583, 43647,
)

# One (table, value column, item ids) entry per extracted series.  The ORDER
# of the entries is part of the output format: mimic_preprocessing.py
# addresses the series of an admission by position (0 .. 15).
SERIES_SPECS = (
    ('chartevents', 'valuenum', (646, 220277)),                    # 0 SpO2
    ('chartevents', 'valuenum', (211, 220045)),                    # 1 HR
    ('chartevents', 'valuenum', (618, 615, 220210, 224690)),       # 2 RR
    ('chartevents', 'valuenum',
     (51, 442, 455, 6701, 220179, 220050)),                        # 3 SBP
    ('chartevents', 'valuenum',
     (8368, 8440, 8441, 8555, 220180, 220051)),                    # 4 DBP
    ('chartevents', 'valuenum', (1817, 228640)),                   # 5 EtCO2
    ('chartevents', 'valuenum', (678, 223761)),                    # 6 Temp(F)
    ('chartevents', 'valuenum', (676, 223762)),                    # 7 Temp(C)
    ('chartevents', 'valuenum', (198, 226755, 227013)),            # 8 TGCS
    # 9-11: capillary refill, charted as free text in `value`
    ('chartevents', 'value', (3348,)),                             # 9 CRR
    ('chartevents', 'value', (115, 223951)),                       # 10 CRR
    ('chartevents', 'value', (8377, 224308)),                      # 11 CRR
    ('outputevents', 'value', URINE_OUTPUT_ITEMS),                 # 12 UO
    ('chartevents', 'valuenum', (2981, 3420, 3422, 223835)),       # 13 FiO2
    ('chartevents', 'valuenum',
     (807, 811, 1529, 3745, 3744, 225664, 220621, 226537)),        # 14 Glucose
    # The original query joined 220274 with "and" instead of "or", which
    # silently dropped item ids 6003 and 220274; all twelve ids are used now.
    ('chartevents', 'valuenum',
     (780, 860, 1126, 1673, 3839, 4202, 4753, 6003, 220274, 220734,
      223830, 228243)),                                            # 15 pH
)


def fetch_series(cur, hadm_id, table, column, item_ids):
    """Return ``(charttime, value)`` rows of one series for one admission.

    cur: open psycopg2 cursor
    hadm_id: hospital admission id to filter on
    table, column: source table and value column; these are interpolated
        into the SQL text, so they must come from SERIES_SPECS, never from
        user input
    item_ids: non-empty sequence of item ids; passed as a query parameter
        (psycopg2 renders a tuple as an SQL ``IN`` list)

    Rows are ordered by charttime.
    """
    query = ("select charttime, {column} from {table} "
             "where hadm_id = %s and itemid in %s "
             "order by charttime").format(column=column, table=table)
    cur.execute(query, (hadm_id, tuple(item_ids)))
    return cur.fetchall()


def main():
    """Dump admission info and per-admission series to two pickle files.

    Writes ``adm_type_los_mortality.p`` (rows of hadm_id, admission type,
    length of stay in whole hours, in-hospital mortality flag) and
    ``vitals_records.p`` (one list of 16 series per admission, in the same
    order as the rows of the first file).
    """
    conn = py.connect(DB_DSN)
    try:
        cur = conn.cursor()
        cur.execute("select hadm_id, admission_type, "
                    "trunc(extract(epoch from dischtime - admittime)/3600), "
                    "hospital_expire_flag from admissions")
        length_of_stay = cur.fetchall()
        with open('adm_type_los_mortality.p', 'wb') as handle:
            pickle.dump(length_of_stay, handle)

        # The admission ids are taken from the rows just dumped (instead of
        # a second, independently ordered query) so that both pickle files
        # are guaranteed to be index-aligned.
        data = []
        for index, row in enumerate(length_of_stay):
            hadm_id = row[0]
            print('%d %s' % (index, hadm_id))
            data.append([
                fetch_series(cur, hadm_id, table, column, item_ids)
                for table, column, item_ids in SERIES_SPECS
            ])

        with open('vitals_records.p', 'wb') as handle:
            pickle.dump(data, handle)
    finally:
        conn.close()


if __name__ == '__main__':
    main()

src/mimic_preprocessing.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
import pickle
2+
import copy
3+
import numpy as np
4+
5+
6+
def load_data(vitals_path='vitals_records.p',
              adm_info_path='adm_type_los_mortality.p',
              min_los_hours=48):
    """Load the extracted series and labels of sufficiently long admissions.

    vitals_path: pickle holding one list of series per admission
    adm_info_path: pickle holding one row per admission, read here as
        (admission id, <unused>, length of stay, label)
    min_los_hours: admissions whose length of stay is below this value, or
        unknown (None), are dropped

    Returns ``(vitals, label)``, two lists of equal length with one entry
    per retained admission, in the order of the admission-info rows.

    Assumes entry ``i`` of the vitals pickle belongs to row ``i`` of the
    admission-info pickle and that admission ids are unique.
    """
    print('Loading files ...')
    # NOTE: unpickling can execute arbitrary code; only load files that you
    # produced yourself with the extraction script.
    with open(vitals_path, 'rb') as handle:
        vitals = pickle.load(handle)
    with open(adm_info_path, 'rb') as handle:
        adm_info = pickle.load(handle)
    print('Loading Done!')

    vitals_by_adm = {}
    for i, record in enumerate(adm_info):
        vitals_by_adm[record[0]] = vitals[i]

    # The explicit None test keeps the Python 2 behaviour (None never
    # satisfied ">= 48") instead of raising TypeError on Python 3.
    selected = [record for record in adm_info
                if record[2] is not None and record[2] >= min_los_hours]

    vitals = [vitals_by_adm[record[0]] for record in selected]
    label = [record[3] for record in selected]
    print('%d %d' % (len(vitals), len(label)))
    return vitals, label
23+
24+
25+
# Value written wherever a feature has no usable reading at a time stamp.
_MISSING = -100

# Capillary refill is charted as free text; these are the known labels.
_CRR_NORMAL = ('Normal <3 secs', 'Normal <3 Seconds', 'Brisk')
_CRR_ABNORMAL = ('Abnormal >3 secs', 'Abnormal >3 Seconds', 'Delayed')
_CRR_IGNORED = ('Other/Remarks', 'Comment')


def _merge_features(record):
    """Return the 12 model series of one admission without mutating it.

    Celsius readings (series 7) are converted to Fahrenheit and joined with
    series 6; series 10 and 11 are joined with series 9; series 5 (EtCO2)
    is dropped.
    """
    temperature = list(record[6])
    temperature.extend((row[0], row[1] * 1.8 + 32)  # Celsius -> Fahrenheit
                       for row in record[7] if row[1] is not None)
    refill = list(record[9]) + list(record[10]) + list(record[11])
    return [record[0], record[1], record[2], record[3], record[4],
            temperature, record[8], refill,
            record[12], record[13], record[14], record[15]]


def _encode_value(value):
    """Map one raw reading to the number stored in the output array."""
    if value is None:
        return _MISSING
    if value in _CRR_NORMAL:
        return 1
    if value in _CRR_ABNORMAL:
        return 2
    if value in _CRR_IGNORED:
        return _MISSING
    try:
        return float(value)
    except (TypeError, ValueError):
        # Unrecognised free text: treat it like the ignored labels rather
        # than failing on the array assignment.
        return _MISSING


def trim_los(data, length_of_stay):
    """Align the series of every admission on a common time axis.

    data: one entry per admission, each a list of 16 series of
        ``(time stamp, value)`` rows in the order written by the
        extraction script; it is left unmodified
    length_of_stay: only time stamps at most this many hours after the
        admission's first time stamp are kept

    Returns ``(a, timestamps)``.  ``timestamps[i]`` is the sorted list of
    distinct retained time stamps of admission ``i``.  ``a`` has shape
    ``(len(data), 12, 2881)``; ``a[i, j, k]`` is the reading of feature
    ``j`` at ``timestamps[i][k]``, -100 if there is none, and 0 for ``k``
    beyond the admission's last time stamp.  When a feature has several
    readings at the same time stamp the first one is used.

    Raises IndexError if an admission has more than 2881 retained time
    stamps.
    """
    num_features = 12  # final features (excluding EtCO2)
    max_length = 2881  # maximum length of time stamp
    a = np.zeros((len(data), num_features, max_length))
    timestamps = []
    for i, record in enumerate(data):
        features = _merge_features(record)

        # union of all time stamps of the admission, oldest first
        observed = sorted(set(row[0] for series in features
                              for row in series))
        kept = [t for t in observed
                if (t - observed[0]).total_seconds() / 3600 <= length_of_stay]
        timestamps.append(kept)

        for j, series in enumerate(features):
            # Looking readings up by time stamp (instead of walking the
            # series with a single cursor) keeps the result correct when
            # the merged series is not sorted or holds duplicate stamps.
            first_reading = {}
            for row in series:
                first_reading.setdefault(row[0], row[1])
            for k, t in enumerate(kept):
                if t in first_reading:
                    a[i, j, k] = _encode_value(first_reading[t])
                else:
                    a[i, j, k] = _MISSING

    return a, timestamps
95+
96+
97+
def fix_input_format(x, T, max_timestamps=200):
    """Return the input in the format expected by the model.

    x: array indexed as (sample, feature, time step) holding the observed
        values; negative entries (such as -100) mark missing readings.  It
        is cleaned IN PLACE, because the returned ``x`` is a view of it.
    T: one sequence of time stamps per sample; left unmodified
    max_timestamps: number of leading time steps to keep

    Returns ``(x, M, delta)``, all of the same shape:
    x: the values, with entries above 500 or below 0 replaced by 0
    M: masking, 1 where the cleaned value is positive and 0 elsewhere, so
        a reading that is exactly 0 counts as missing
    delta: time points of observation, in hours since the sample's first
        time stamp, repeated for every feature; 0 beyond the sample's last
        time stamp
    """
    # trim time steps beyond the limit
    x = x[:, :, :max_timestamps]
    M = np.zeros_like(x)
    delta = np.zeros_like(x)
    print(x.shape, len(T))

    # count outliers and negative values as missing values
    # M = 0 indicates missing value
    # M = 1 indicates observed value
    # now since we have mask variable, we don't need -100
    invalid = (x > 500) | (x < 0)
    x[invalid] = 0.0
    M[x > 0] = 1

    limit = x.shape[2]
    for j in range(x.shape[0]):
        stamps = T[j][:limit]
        if len(stamps) == 0:
            continue
        origin = stamps[0]
        # one row of hour offsets, broadcast over all features
        delta[j, :, :len(stamps)] = [
            (stamp - origin).total_seconds() / 3600.0 for stamp in stamps
        ]

    return x, M, delta
138+

0 commit comments

Comments
 (0)