Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ Available Datasets
datasets/pyhealth.datasets.EHRShotDataset
datasets/pyhealth.datasets.Support2Dataset
datasets/pyhealth.datasets.BMDHSDataset
datasets/pyhealth.datasets.CaReSoundDataset
datasets/pyhealth.datasets.COVID19CXRDataset
datasets/pyhealth.datasets.ChestXray14Dataset
datasets/pyhealth.datasets.TUABDataset
Expand Down
11 changes: 11 additions & 0 deletions docs/api/datasets/pyhealth.datasets.CaReSoundDataset.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
pyhealth.datasets.CaReSoundDataset
==================================

The CaReSound dataset provides question and answer pairs for medical sounds. For more information see `CaReSound <https://huggingface.co/datasets/tsnngw/CaReSound/>`_. This dataset was contributed as part of the CaReAQA: A Cardiac and Respiratory Audio Question Answering Model for Open-Ended Diagnostic Reasoning work (`arXiv:2505.01199 <https://arxiv.org/abs/2505.01199>`_).

.. autoclass:: pyhealth.datasets.CaReSoundDataset
:members:
:undoc-members:
:show-inheritance:


1 change: 1 addition & 0 deletions docs/api/tasks.rst
Original file line number Diff line number Diff line change
Expand Up @@ -229,3 +229,4 @@ Available Tasks
Mutation Pathogenicity (COSMIC) <tasks/pyhealth.tasks.MutationPathogenicityPrediction>
Cancer Survival Prediction (TCGA) <tasks/pyhealth.tasks.CancerSurvivalPrediction>
Cancer Mutation Burden (TCGA) <tasks/pyhealth.tasks.CancerMutationBurden>
CaReSound <tasks/pyhealth.tasks.CaReSoundAQA>
7 changes: 7 additions & 0 deletions docs/api/tasks/pyhealth.tasks.CaReSoundAQA
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
pyhealth.tasks.CaReSoundAQA
==============================================

.. autoclass:: pyhealth.tasks.CaReSoundAQA
:members:
:undoc-members:
:show-inheritance:
1 change: 1 addition & 0 deletions pyhealth/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(self, *args, **kwargs):

from .base_dataset import BaseDataset
from .cardiology import CardiologyDataset
from .caresound import CaReSoundDataset
from .chestxray14 import ChestXray14Dataset
from .clinvar import ClinVarDataset
from .cosmic import COSMICDataset
Expand Down
217 changes: 217 additions & 0 deletions pyhealth/datasets/caresound.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
"""CaReSound dataset for PyHealth.

This module provides the CaReSoundDataset class for loading and processing
the CaReSound benchmark data for Audio Question Answering (AQA) tasks.
"""
import logging
import os
from pathlib import Path
from typing import List, Optional, Dict, Any

import pandas as pd
from .base_dataset import BaseDataset

logger = logging.getLogger(__name__)

class CaReSoundDataset(BaseDataset):
    """CaReSound dataset for open-ended diagnostic reasoning.

    This dataset aggregates five medical audio sources: ICBHI, KAUH,
    CirCor, SPRSound, and ZCHSound. It pairs respiratory and cardiac
    audio with 34,792 GPT-4o generated Question-Answer pairs.

    Args:
        root: Root directory containing the audio files (.wav) and/or CSVs.
        tables: Optional list of extra tables to load in addition to the
            default ``"metadata"`` table.
        dataset_name: Optional name of the dataset. Defaults to "caresound".
        config_path: Optional path to the configuration file.
    """

    # Lowercase path substrings used to attribute a .wav file to its
    # source corpus; first match wins.
    _SOURCE_MARKERS = (
        ("icbhi", "ICBHI"),
        ("circor", "CirCor"),
        ("kauh", "KAUH"),
        ("spr", "SPRSound"),
        ("zch", "ZCHSound"),
    )

    def __init__(
        self,
        root: str,
        tables: Optional[List[str]] = None,
        dataset_name: Optional[str] = None,
        config_path: Optional[str] = None,
        **kwargs,
    ) -> None:
        if config_path is None:
            logger.info("No config path provided, using default config")
            config_path = Path(__file__).parent / "configs" / "caresound.yaml"

        # 1. Prepare standardized CSV (handles all local/API edge cases)
        pyhealth_csv = os.path.join(root, "caresound_metadata.csv")
        if not os.path.exists(pyhealth_csv):
            self.prepare_metadata(root)

        # 2. Resolve local audio paths dynamically
        self.audio_path_map = self._resolve_audio_paths(root)

        # 3. Define the default table mapped in the YAML; filter the
        #    caller's list so "metadata" is never loaded twice.
        tables = ["metadata"] + [t for t in (tables or []) if t != "metadata"]

        super().__init__(
            root=root,
            tables=tables,
            dataset_name=dataset_name or "caresound",
            config_path=config_path,
            **kwargs,
        )

    @classmethod
    def _build_audio_map(cls, root: str) -> Dict[tuple, str]:
        """Index .wav files under ``root`` by ``(source, id)`` keys.

        Every file is registered under its exact stem and, as a fallback,
        under its base patient id (the stem portion before the first
        underscore, e.g. '101' from '101_1b1'). Exact-stem entries always
        take precedence over fallback entries, and for duplicate keys the
        first file encountered wins — this replaces the previous
        inconsistent overwrite behavior between the two call sites.

        Returns:
            Mapping of ``(source, id)`` to absolute audio file path.
        """
        exact: Dict[tuple, str] = {}
        fallback: Dict[tuple, str] = {}
        for path in Path(root).rglob("*.wav"):
            stem = path.stem
            path_str = str(path).lower()
            source = next(
                (name for marker, name in cls._SOURCE_MARKERS
                 if marker in path_str),
                "Unknown",
            )
            abs_path = str(path.absolute())
            # 1. Primary mapping: exact filename stem.
            exact.setdefault((source, stem), abs_path)
            # 2. Fallback mapping: base patient id.
            fallback.setdefault((source, stem.split('_')[0]), abs_path)
        # Merge so exact stems shadow base-id fallbacks on key collision.
        return {**fallback, **exact}

    @classmethod
    def prepare_metadata(cls, root: str) -> None:
        """Prepares QA metadata from local ZIP/CSV drops or downloads via HF API.

        Writes ``caresound_metadata.csv`` under ``root`` containing all QA
        pairs with their resolved local audio paths injected.

        Args:
            root: Directory searched for source CSVs and .wav files.

        Raises:
            ImportError: If the Hugging Face fallback is needed but the
                'datasets' library is not installed.
            Exception: Re-raised if the Hugging Face download fails.
        """
        output_path = os.path.join(root, "caresound_metadata.csv")

        train_csv = os.path.join(root, "CaReSoundQA_train.csv")
        test_csv = os.path.join(root, "CaReSoundQA_test.csv")
        full_csv = os.path.join(root, "CaReSoundQA.csv")

        # Scenario A: User manually downloaded both Train and Test CSVs
        if os.path.exists(train_csv) and os.path.exists(test_csv):
            logger.info("Found local train/test CSVs. Merging...")
            df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)
            df_train['hf_split'], df_test['hf_split'] = 'train', 'test'
            df_master = pd.concat([df_train, df_test], ignore_index=True)

        # Scenario B: User manually downloaded ONLY Train CSV
        elif os.path.exists(train_csv):
            logger.warning("Found local train CSV, but test is missing. Using train only.")
            df_master = pd.read_csv(train_csv)
            df_master['hf_split'] = 'train'

        # Scenario C: User manually downloaded the Master CSV
        elif os.path.exists(full_csv):
            logger.info(f"Found master CSV: {full_csv}.")
            df_master = pd.read_csv(full_csv)
            if 'hf_split' not in df_master.columns:
                df_master['hf_split'] = 'unknown'

        # Scenario D: Fallback to Hugging Face API
        else:
            try:
                from datasets import load_dataset
                logger.info("Local metadata not found. Fetching from tsnngw/CaReSound...")
                dataset = load_dataset("tsnngw/CaReSound")

                df_train = dataset['train'].to_pandas()
                df_train['hf_split'] = 'train'

                # Guard in case the dataset structure changes on HF.
                if 'test' in dataset:
                    df_test = dataset['test'].to_pandas()
                    df_test['hf_split'] = 'test'
                    df_master = pd.concat([df_train, df_test], ignore_index=True)
                else:
                    df_master = df_train

            except ImportError:
                logger.error("The 'datasets' library is required. Run: pip install datasets")
                raise
            except Exception as e:
                logger.error(f"Failed to fetch metadata: {e}")
                raise

        # Inject local audio paths right before saving, reusing the same
        # resolution logic as _resolve_audio_paths (previously duplicated
        # inline with subtly different overwrite semantics).
        audio_map = cls._build_audio_map(root)
        df_master['audio_path'] = df_master.apply(
            lambda r: audio_map.get(
                (str(r.get('dataset', 'Unknown')), str(r.get('patient_id', ''))), ""
            ),
            axis=1,
        )

        # Save the final CSV for the new PyHealth Engine to pick up automatically
        df_master.to_csv(output_path, index=False)
        logger.info(f"Saved {len(df_master)} QA pairs with mapped audio to {output_path}")

    def _resolve_audio_paths(self, root: str) -> Dict[tuple, str]:
        """Maps .wav files using robust stem and prefix mapping.

        Thin wrapper around :meth:`_build_audio_map` that warns when no
        audio files are present under ``root``.
        """
        audio_map = self._build_audio_map(root)
        if not audio_map:
            logger.warning(f"No .wav files found in {root}.")
        return audio_map

    def parse_func(self) -> Dict[str, Any]:
        """Merges tabular QA metadata with local audio paths.

        Returns:
            Dict keyed by patient_id; each value holds the patient's
            visits (one per (source, patient) pair), and each visit
            carries its audio path plus a list of QA events.
        """
        csv_path = os.path.join(self.root, "caresound_metadata.csv")
        df = pd.read_csv(csv_path)

        patients = {}
        missing_sources = set()

        for _, row in df.iterrows():
            pid = str(row['patient_id'])
            source = str(row['dataset'])

            audio_path = self.audio_path_map.get((source, pid))

            # Skip rows whose audio was not found locally; remember the
            # source so we can emit a single aggregated warning below.
            if not audio_path:
                missing_sources.add(source)
                continue

            if pid not in patients:
                patients[pid] = {"patient_id": pid, "visits": {}}

            visit_id = f"{source}_{pid}"
            if visit_id not in patients[pid]["visits"]:
                patients[pid]["visits"][visit_id] = {
                    "visit_id": visit_id,
                    "audio_path": audio_path,
                    "events": []
                }

            patients[pid]["visits"][visit_id]["events"].append({
                "question": row.get('question', ''),
                "answer": row.get('answer', ''),
                "hf_split": row.get('hf_split', 'unknown')
            })

        if missing_sources:
            logger.warning(
                f"Audio files missing for datasets: {', '.join(missing_sources)}. "
                "Only available multi-modal samples have been loaded."
            )

        return patients

    @property
    def default_task(self):
        """Default task for this dataset (imported lazily to avoid cycles)."""
        from pyhealth.tasks import CaReSoundAQA
        return CaReSoundAQA()
12 changes: 12 additions & 0 deletions pyhealth/datasets/configs/caresound.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# PyHealth table configuration for the CaReSound dataset.
# Maps the standardized CSV written by CaReSoundDataset.prepare_metadata
# onto a single "metadata" table keyed by patient_id.
version: "1.0"
tables:
  metadata:
    file_path: "caresound_metadata.csv"
    patient_id: "patient_id"
    # QA pairs carry no event time, so the table has no timestamp column.
    timestamp: null
    # Columns exposed to tasks via each event's attr_dict.
    attributes:
      - "dataset"
      - "question"
      - "answer"
      - "hf_split"
      - "audio_path"
1 change: 1 addition & 0 deletions pyhealth/tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .base_task import BaseTask
from .benchmark_ehrshot import BenchmarkEHRShot
from .cancer_survival import CancerMutationBurden, CancerSurvivalPrediction
from .caresound_tasks import CaReSoundAQA
from .bmd_hs_disease_classification import BMDHSDiseaseClassification
from .cardiology_detect import (
cardiology_isAD_fn,
Expand Down
57 changes: 57 additions & 0 deletions pyhealth/tasks/caresound_tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Audio Question Answering tasks for PyHealth.

This module provides tasks for processing generative text answers
based on medical audio signals using the CaReSound dataset.
"""

from typing import Any, Dict, List
from .base_task import BaseTask


class CaReSoundAQA(BaseTask):
    """Task for Audio Question Answering on respiratory and cardiac sounds.

    Each sample pairs a local audio file path and a free-text question
    with its reference answer. Events with a missing audio path,
    question, or answer are skipped.

    Attributes:
        task_name (str): The name of the task.
        input_schema (Dict[str, str]): Required model input (question);
            the audio file path is carried alongside in each sample dict.
        output_schema (Dict[str, str]): Required outputs (answer).
    """
    task_name: str = "CaReSoundAQA"
    input_schema: Dict[str, str] = {
        "question": "text",
    }
    output_schema: Dict[str, str] = {"answer": "text"}

    def _safe_str(self, value: Any, default: str = "") -> str:
        """Safely convert value to string, handling None and NaN."""
        if value is None or str(value).lower() == "nan":
            return default
        return str(value)

    def __call__(self, patient: Any) -> List[Dict[str, Any]]:
        """Process a patient record to extract audio-QA samples.

        Args:
            patient: Patient object exposing ``get_events("metadata")``
                and a ``patient_id`` attribute.

        Returns:
            One sample dict per usable QA event with keys: patient_id,
            visit_id, audio_path, question, answer, original_hf_split.
        """
        samples: List[Dict[str, Any]] = []
        events = patient.get_events("metadata")

        for event in events:
            # The new engine puts CSV columns into attr_dict
            attr = getattr(event, "attr_dict", {})

            # Use _safe_str so pandas NaN cells (str(nan) == "nan") are
            # treated as missing instead of passing the guard below as a
            # truthy "nan" string. (Previously _safe_str was defined but
            # never called.)
            audio_path = self._safe_str(attr.get("audio_path"))
            question = self._safe_str(attr.get("question"))
            answer = self._safe_str(attr.get("answer"))
            hf_split = self._safe_str(attr.get("hf_split"), "unknown")

            if not audio_path or not question or not answer:
                continue

            samples.append({
                "patient_id": patient.patient_id,
                "visit_id": f"v_{patient.patient_id}",
                "audio_path": audio_path,
                "question": question,
                "answer": answer,
                "original_hf_split": hf_split,
            })

        return samples
13 changes: 13 additions & 0 deletions test-resources/caresound/datasets/CaReSoundQA_train.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
patient_id,question,answer,dataset
65109516,Were any abnormal lung sounds noted during auscultation?,"No, the lungs were normal during auscultation.",SPRSound
ZCH0810,What specific type of defect is indicated in the diagnosis?,A ventricular septal defect is indicated in the diagnosis.,ZCHSound
147,What is the diagnosis based on the auscultation findings?,COPD,ICBHI
159,Are crackles present in the anterior right chest location?,"No, crackles are not present in the anterior right chest location.",ICBHI
85172,Is the murmur heard more prominently at any particular valve area?,"Yes, the murmur is most audible at the pulmonic valve area.",CirCor
BP50,Where was the normal respiratory sound heard?,Posterior Right Lower,KAUH
DP83,Where was the sound located during auscultation?,Anterior Right Upper,KAUH
EP31,Where is the location of the auscultation?,Posterior Lower Middle,KAUH
ZCH1062,Is there any abnormality detected in the cardiac auscultation findings?,"No, the cardiac auscultation findings are normal.",ZCHSound
154,Is there evidence of wheezing in the patient's auscultation?,"No, there is no evidence of wheezing.",ICBHI
ZCH0125,Are there any abnormalities in the heart sounds?,"No, the heart sounds are normal.",ZCHSound
203,What is the diagnosis based on auscultation?,COPD,ICBHI
Loading