Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
5bdaa76
Create ehr_foundation_model task
will-pang Feb 11, 2026
5a49feb
Add example for testing
will-pang Feb 11, 2026
9777311
Update ehr_foundational_model_mimic4.py
will-pang Feb 11, 2026
6ecf289
Merge remote-tracking branch 'upstream/master' into FoundationalEHR/w…
will-pang Feb 12, 2026
60641c0
Update ehr_foundational_model_mimic4.py
will-pang Feb 12, 2026
e04f54e
Update ehr_foundational_model_mimic4.py
will-pang Feb 12, 2026
13e46f0
Add handling of missing notes
will-pang Feb 12, 2026
f456e53
Update ehr_foundational_model_mimic4.py
will-pang Feb 12, 2026
9eea8fe
update comments
will-pang Feb 12, 2026
eafd929
update comments
will-pang Feb 12, 2026
fe53e89
Update tuple_time_text_processor.py
will-pang Feb 12, 2026
0e1df77
Update multimodal_task.py
will-pang Feb 12, 2026
3d30dbf
Update tuple_time_text_processor.py
will-pang Feb 15, 2026
20306e3
Update ehr_foundational_model_mimic4.py
will-pang Feb 15, 2026
24d1e7b
Update tuple_time_text_processor.py
will-pang Feb 15, 2026
4fab928
Update tuple_time_text_processor.py
will-pang Feb 15, 2026
bdf00e0
Update comments
will-pang Feb 16, 2026
06cdd1e
Update ehr_foundational_model_mimic4.py
will-pang Feb 16, 2026
a1b2756
Update tuple_time_text_processor.py
will-pang Feb 16, 2026
adf3a53
Minor update in docs
will-pang Feb 16, 2026
aa1b0ed
Create test_ehr_foundational_model_mimic4.py
will-pang Feb 16, 2026
3edf07e
Add unit test
will-pang Feb 16, 2026
2413fd5
Update naming
will-pang Feb 18, 2026
a353f60
Update ehr_foundational_model_mimic4.py
will-pang Feb 18, 2026
12e968d
Remove comments
will-pang Feb 18, 2026
b110a2e
Update ehr_foundational_model_mimic4.py
will-pang Feb 19, 2026
f886cdd
Delete test_ehr_foundational_model_mimic4.py
will-pang Feb 19, 2026
f1b74f2
Renaming updates
will-pang Feb 19, 2026
6fc7bd0
Update ehr_foundational_model_mimic4.py
will-pang Feb 19, 2026
628daa5
Merge branch 'sunlabuiuc:master' into FoundationalEHR/wp-create-multi…
will-pang Feb 19, 2026
0584779
Merge branch 'sunlabuiuc:master' into FoundationalEHR/wp-multimodal-t…
will-pang Feb 19, 2026
045e173
Merge branch 'FoundationalEHR/wp-multimodal-task-lab-events-icd-codes…
will-pang Feb 19, 2026
fe764ac
Update ehr_foundational_model_mimic4.py
will-pang Feb 19, 2026
53761d6
Update ehr_foundational_model_mimic4.py
will-pang Feb 19, 2026
08c0240
Merge pull request #1 from will-pang/FoundationalEHR/wp-create-multim…
will-pang Feb 19, 2026
2957aa9
Add lab events and icd 10 codes
will-pang Feb 19, 2026
b8221aa
Update ehr_foundational_model_mimic4.py
will-pang Feb 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions examples/foundation_ehr/multimodal_task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import os

# PyHealth Packages
from pyhealth.datasets import MIMIC4Dataset
from pyhealth.tasks.ehr_foundational_model_mimic4 import EHRFoundationalModelMIMIC4

# Root of the local checkout that mirrors the MIMIC-IV data layout.
# Override with the PYHEALTH_REPO_ROOT environment variable instead of editing
# this file -- the previous hard-coded user path broke on any other machine.
# (see: https://github.com/sunlabuiuc/PyHealth/blob/master/examples/mortality_prediction/multimodal_mimic4_minimal.py)
PYHEALTH_REPO_ROOT = os.environ.get("PYHEALTH_REPO_ROOT", "/Users/wpang/Desktop/PyHealth")

EHR_ROOT = os.path.join(PYHEALTH_REPO_ROOT, "srv/local/data/physionet.org/files/mimiciv/2.2")
NOTE_ROOT = os.path.join(PYHEALTH_REPO_ROOT, "srv/local/data/physionet.org/files/mimic-iv-note/2.2")
CXR_ROOT = os.path.join(PYHEALTH_REPO_ROOT, "srv/local/data/physionet.org/files/mimic-cxr-jpg/2.0.0")
CACHE_DIR = os.path.join(PYHEALTH_REPO_ROOT, "srv/local/data/wp/pyhealth_cache")

if __name__ == "__main__":

    # Build the multimodal MIMIC-IV dataset (structured EHR tables + notes).
    # dev=True limits the cohort to a small subset for quick local iteration.
    dataset = MIMIC4Dataset(
        ehr_root=EHR_ROOT,
        note_root=NOTE_ROOT,
        ehr_tables=["diagnoses_icd", "procedures_icd", "prescriptions", "labevents"],
        note_tables=["discharge", "radiology"],
        cache_dir=CACHE_DIR,
        num_workers=8,
        dev=True,
    )

    # Apply multimodal task
    task = EHRFoundationalModelMIMIC4()
    samples = dataset.set_task(task)

    # Get and print one sample to sanity-check the pipeline end to end
    sample = samples[0]
    print(sample)
75 changes: 39 additions & 36 deletions pyhealth/processors/tuple_time_text_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,99 +8,102 @@
Input: Tuple[List[str], List[float]]
- List[str]: Clinical text entries (e.g., discharge notes, progress notes)
- List[float]: Time differences between entries (in any time unit)
Output: Tuple[List[str], torch.Tensor, str]
- List[str]: Same text entries (unmodified)
- torch.Tensor: 1D float tensor of time differences

Output: Tuple[torch.Tensor, torch.Tensor, str]
- torch.Tensor: Text Token IDs from tokenizer [shape: (num_texts, max_seq_len)]
- torch.Tensor: 1D float tensor of time differences [shape: (N,)]
- str: Type tag for automatic modality routing (default: "note")

Use Case:
This processor enables automatic modality bucketing in multimodal pipelines.
The type_tag allows downstream models to automatically route different feature
types to appropriate encoders without hardcoding feature names:

- type_tag="note" routes to text encoder
- type_tag="image" routes to vision encoder
- type_tag="ehr" routes to EHR encoder

This design eliminates the need to manually map task schema feature_keys to
specific model components.

Example:
>>> from pyhealth.processors import TupleTimeTextProcessor
>>> processor = TupleTimeTextProcessor(type_tag="note")
>>>
>>> processor = TupleTimeTextProcessor(type_tag="note", tokenizer_name="dmis-lab/biobert-base-cased-v1.1")
>>>
>>> # Clinical notes with time differences
>>> texts = [
... "Patient admitted with chest pain.",
... "Follow-up: symptoms improved.",
... "Discharge: stable condition."
... ]
>>> time_diffs = [0.0, 2.5, 5.0] # hours since admission
>>>
>>>
>>> result = processor.process((texts, time_diffs))
>>> texts_out, time_tensor, tag = result
>>> print(f"Texts: {texts_out}")
>>> token_ids, time_tensor, tag = result
>>> print(f"Text Token IDs shape: {token_ids.shape}")
>>> print(f"Time tensor: {time_tensor}")
>>> print(f"Type tag: {tag}")

Args:
type_tag (str): Modality identifier for automatic routing in multimodal
models. Common values: "note", "image", "ehr", "signal".
Default: "note"
tokenizer_name (str): HuggingFace model name for the tokenizer.
Default: "dmis-lab/biobert-base-cased-v1.1"
"""

from typing import Any, Dict, List, Tuple
import torch
from transformers import AutoTokenizer
from .base_processor import FeatureProcessor
from . import register_processor


@register_processor("tuple_time_text")
class TupleTimeTextProcessor(FeatureProcessor):
    """Processes (text, time_diff) tuples for multimodal temporal fusion.

    Tokenizes text entries using a HuggingFace tokenizer and converts the
    paired temporal data into tensors for downstream model consumption. The
    ``type_tag`` lets multimodal models route the output to the right encoder
    without hardcoding feature names.
    """

    def __init__(self, type_tag: str = "note", tokenizer_name: str = "dmis-lab/biobert-base-cased-v1.1"):
        """Initialize the processor.

        Args:
            type_tag: Modality identifier for automatic routing. Default: "note"
            tokenizer_name: HuggingFace model name for the tokenizer.
                Default: "dmis-lab/biobert-base-cased-v1.1"
        """
        super().__init__()
        self.type_tag = type_tag
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def process(self, value: Tuple[List[str], List[float]]) -> Tuple[torch.Tensor, torch.Tensor, str]:
        """Process a tuple of texts and time differences.

        Tokenizes the text entries using the HuggingFace tokenizer and
        converts time differences to a float tensor.

        Args:
            value: Tuple containing:
                - List[str]: Text entries (clinical notes, observations, etc.)
                - List[float]: Time differences corresponding to each text entry

        Returns:
            Tuple containing:
                - torch.Tensor: Text Token IDs [shape: (T: num_texts, L: max_token_len)]
                - torch.Tensor: 1D float tensor of time differences [shape: (N,)]
                - str: Type tag for modality routing

        Example:
            >>> processor = TupleTimeTextProcessor(type_tag="clinical_note")
            >>> texts = ["Note 1", "Note 2"]
            >>> times = [0.0, 24.0]  # hours
            >>> result = processor.process((texts, times))
            >>> print(result[1])  # tensor([0., 24.])
        """
        texts, time_diffs = value
        time_tensor = torch.tensor(time_diffs, dtype=torch.float32)
        if not texts:
            # HF tokenizers cannot build a "pt" batch from an empty list;
            # return an empty (0, 0) id tensor so callers can handle
            # patients with no notes instead of crashing.
            return torch.empty((0, 0), dtype=torch.long), time_tensor, self.type_tag
        text_token_ids = self.tokenizer(
            texts, padding=True, truncation=True, return_tensors="pt"
        )["input_ids"]
        return text_token_ids, time_tensor, self.type_tag

    def size(self):
        """Return the vocabulary size of the tokenizer."""
        return self.tokenizer.vocab_size

    def __repr__(self):
        return f"TupleTimeTextProcessor(type_tag='{self.type_tag}', tokenizer='{self.tokenizer.name_or_path}')"
Loading