diff --git a/docs/api/tasks.rst b/docs/api/tasks.rst
index 399b8f1aa..4a3de79f2 100644
--- a/docs/api/tasks.rst
+++ b/docs/api/tasks.rst
@@ -229,3 +229,4 @@ Available Tasks
     Mutation Pathogenicity (COSMIC) <tasks/pyhealth.tasks.MutationPathogenicityPrediction>
     Cancer Survival Prediction (TCGA) <tasks/pyhealth.tasks.CancerSurvivalPrediction>
     Cancer Mutation Burden (TCGA) <tasks/pyhealth.tasks.CancerMutationBurden>
+    Discharge Note Summarization (MIMIC-IV) <tasks/pyhealth.tasks.DischargeNoteSummarization>
diff --git a/docs/api/tasks/pyhealth.tasks.DischargeNoteSummarization.rst b/docs/api/tasks/pyhealth.tasks.DischargeNoteSummarization.rst
new file mode 100644
index 000000000..916a12c9f
--- /dev/null
+++ b/docs/api/tasks/pyhealth.tasks.DischargeNoteSummarization.rst
@@ -0,0 +1,7 @@
+pyhealth.tasks.DischargeNoteSummarization
+=========================================
+
+.. autoclass:: pyhealth.tasks.discharge_note_summarization.DischargeNoteSummarization
+    :members:
+    :undoc-members:
+    :show-inheritance:
\ No newline at end of file
diff --git a/examples/discharge__summary_samples.ipynb b/examples/discharge__summary_samples.ipynb
new file mode 100644
index 000000000..c11f1bc53
--- /dev/null
+++ b/examples/discharge__summary_samples.ipynb
@@ -0,0 +1,3152 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "9QuP-XLwAF3w"
+      },
+      "source": [
+        "# Generate cleaned Discharge Summary Samples using DischargeNoteSummarization Task\n",
+        "\n",
+        "This notebook demonstrates how to use the MIMIC-IV Note dataset and the DischargeNoteSummarization task to generate discharge summary samples for LLM training."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "4Hj9Zi4v2Nis"
+      },
+      "outputs": [],
+      "source": [
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "eWj28Ms7AEO9"
+      },
+      "source": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "MUAyQZQnFbbv"
+      },
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "from pyhealth.datasets import MIMIC4Dataset\n",
+        "from pyhealth.tasks import BaseTask\n",
+        "from pyhealth.data import Patient\n",
+        "from typing import List, Dict, Any\n",
+        "from pyhealth.processors import TextProcessor\n",
+        "import argparse\n",
+        "import random\n",
+        "import pandas as pd\n",
+        "from pathlib import Path\n",
+        "\n",
+        "pd.options.mode.chained_assignment = None\n",
+        "import re\n",
+        "import pickle\n",
+        "import nltk\n",
+        "from collections import Counter\n",
+        "from tqdm import tqdm\n",
+        "import string"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "738h8qAMA5Fs"
+      },
+      "source": [
+        "# Initialize the MIMIC4Dataset using the note data downloaded from the PhysioNet website.\n",
+        "\n",
+        "The dataset file used is discharge.csv.gz from PhysioNet: https://physionet.org/content/ann-pt-summ/1.0.1/"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "lRIRtrhQNKS2",
+        "outputId": "886f4286-e412-4e3a-9f91-bd24c329d397"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Memory usage Starting MIMIC4Dataset init: 882.8 MB\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.mimic4:Memory usage Starting MIMIC4Dataset init: 882.8 MB\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Initializing mimic4 dataset from None|/content/drive/MyDrive/llm_data/|None (dev mode: False)\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Initializing mimic4 dataset from None|/content/drive/MyDrive/llm_data/|None (dev mode: False)\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "No cache_dir provided. Using default cache dir: /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:No cache_dir provided. Using default cache dir: /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Initializing MIMIC4NoteDataset with tables: ['discharge'] (dev mode: False)\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.mimic4:Initializing MIMIC4NoteDataset with tables: ['discharge'] (dev mode: False)\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Using default note config: /usr/local/lib/python3.12/dist-packages/pyhealth/datasets/configs/mimic4_note.yaml\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.mimic4:Using default note config: /usr/local/lib/python3.12/dist-packages/pyhealth/datasets/configs/mimic4_note.yaml\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Memory usage Before initializing mimic4_note: 882.9 MB\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "/usr/local/lib/python3.12/dist-packages/pyhealth/datasets/mimic4.py:121: UserWarning: Events from discharge table only have date timestamp (no specific time). This may affect temporal ordering of events.\n",
+            "  warnings.warn(\n",
+            "INFO:pyhealth.datasets.mimic4:Memory usage Before initializing mimic4_note: 882.9 MB\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Initializing mimic4_note dataset from /content/drive/MyDrive/llm_data/ (dev mode: False)\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Initializing mimic4_note dataset from /content/drive/MyDrive/llm_data/ (dev mode: False)\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Using provided cache_dir: /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/cf4117bc-6d03-5673-a78c-162795de42ea\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Using provided cache_dir: /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/cf4117bc-6d03-5673-a78c-162795de42ea\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Memory usage After initializing mimic4_note: 883.0 MB\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.mimic4:Memory usage After initializing mimic4_note: 883.0 MB\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Memory usage After Note dataset initialization: 883.0 MB\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.mimic4:Memory usage After Note dataset initialization: 883.0 MB\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Memory usage Completed MIMIC4Dataset init: 883.0 MB\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.mimic4:Memory usage Completed MIMIC4Dataset init: 883.0 MB\n"
+          ]
+        }
+      ],
+      "source": [
+        "full_note_dataset = MIMIC4Dataset(\n",
+        "    note_root='/content/drive/llm_data/',\n",
+        "    note_tables=[\"discharge\"]\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Usgcv8CU4cj2"
+      },
+      "outputs": [],
+      "source": [
+        "full_note_dataset.stats()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "hhcWPLJmBgOQ"
+      },
+      "source": [
+        "# Print the events of a single patient, looked up by patient ID"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "aacWVtZdnsoA"
+      },
+      "outputs": [],
+      "source": [
+        "print(full_note_dataset.get_patient('10000032').get_events())"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "xyqAklNtB_t0"
+      },
+      "source": [
+        "# Define the DischargeNoteSummarization Task\n",
+        "\n",
+        "Create the DischargeNoteSummarization class and initialize its input and output schemas.\n",
+        "Extract the \"Brief Hospital Course\" and \"Discharge Instructions\" sections. Clean each sample by collapsing extra spaces and newlines so that each extracted text becomes a single paragraph.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "31loQfr6nSRf"
+      },
+      "outputs": [],
+      "source": [
+        "\n",
+        "from typing import Dict, List, Any, Tuple, Union\n",
+        "\n",
+        "class DischargeNoteSummarization(BaseTask):\n",
+        "    task_name: str = \"DischargeNoteSummarization\"\n",
+        "\n",
+        "    input_schema: Dict[str , str] = {\n",
+        "      \"subject_id\" : \"text\",\n",
+        "      \"hadm_id\": \"text\",\n",
+        "      \"text\": \"text\"\n",
+        "    }\n",
+        "\n",
+        "    output_schema: Dict[str, str] = {\n",
+        "        \"brief_hospital_course\": \"text\",\n",
+        "        \"summary\": \"text\"\n",
+        "    }\n",
+        "\n",
+        "\n",
+        "    def __call__(self, patient: Patient) -> List[Dict[str, Any]]:\n",
+        "      samples = []\n",
+        "      subject_id = patient.patient_id\n",
+        "      for dis in patient.get_events(\"discharge\"):\n",
+        "\n",
+        "          textNote = dis.attr_dict['text']\n",
+        "          hadm_id = dis.attr_dict['hadm_id']\n",
+        "\n",
+        "          ## Extract the brief_hospital_course\n",
+        "\n",
+        "          start = textNote.find(\"Brief Hospital Course:\")\n",
+        "          if start < 0:\n",
+        "              #brief_hospital_course = None\n",
+        "              continue\n",
+        "          end = textNote.find(\"Medications on Admission:\")\n",
+        "          if end == -1:\n",
+        "              end = textNote.find(\"Discharge Medications:\")\n",
+        "          if end == -1:\n",
+        "              end = textNote.find(\"Discharge Disposition:\")\n",
+        "          if end == 0 or start >= end:\n",
+        "              continue\n",
+        "          brief_hospital_course = textNote[start: end].replace('\\n',  ' ')\n",
+        "          brief_hospital_course = ' '.join(brief_hospital_course.split())\n",
+        "          # Quality check\n",
+        "          num_words = len(textNote.split(' '))\n",
+        "          \n",
+        "          #extract the summary\n",
+        "          start = textNote.find(\"Discharge Instructions:\")\n",
+        "          end = textNote.find(\"Followup Instructions:\")\n",
+        "          if start < 0 or end < 0:\n",
+        "              continue\n",
+        "          summary = textNote[start: end].replace('\\n',  ' ')\n",
+        "          summary = ' '.join(summary.split())\n",
+        "          if len(summary) == 0 or len(summary) < 350:\n",
+        "            continue\n",
+        "          summary = summary.strip()\n",
+        "\n",
+        "\n",
+        "\n",
+        "          samples.append({\n",
+        "            \"text\":textNote,\n",
+        "            \"brief_hospital_course\": brief_hospital_course,\n",
+        "            \"summary\" : summary,\n",
+        "            \"subject_id\" : subject_id,\n",
+        "            \"hadm_id\": hadm_id\n",
+        "          })\n",
+        "\n",
+        "      return samples"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "8DqgPHBSrXdm"
+      },
+      "outputs": [],
+      "source": [
+        "! rm -r /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/tasks/PatientNoteProcessingTask_46bb372d-34eb-5a38-bd99-ca6f30f0f026/samples_cdbbc602-34e2-5a41-8643-4c76b08829f6.ld"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "7iZFJlJKDEyW"
+      },
+      "source": [
+        "# Run the Discharge Note Summarization Task\n",
+        "\n",
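+        "Run the DischargeNoteSummarization task on the note dataset with num_workers=4. (As the log output below shows, PyHealth falls back to a single worker when it detects a Jupyter notebook environment.)"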
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 905
+        },
+        "id": "5Lh3QhOUqCWe",
+        "outputId": "5ddfc211-28af-4c22-dbda-e04f368d7b2e"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Setting task PatientNoteProcessingTask for mimic4 base dataset...\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Setting task PatientNoteProcessingTask for mimic4 base dataset...\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Task cache paths: task_df=/root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/tasks/PatientNoteProcessingTask_46bb372d-34eb-5a38-bd99-ca6f30f0f026/task_df.ld, samples=/root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/tasks/PatientNoteProcessingTask_46bb372d-34eb-5a38-bd99-ca6f30f0f026/samples_cdbbc602-34e2-5a41-8643-4c76b08829f6.ld\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Task cache paths: task_df=/root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/tasks/PatientNoteProcessingTask_46bb372d-34eb-5a38-bd99-ca6f30f0f026/task_df.ld, samples=/root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/tasks/PatientNoteProcessingTask_46bb372d-34eb-5a38-bd99-ca6f30f0f026/samples_cdbbc602-34e2-5a41-8643-4c76b08829f6.ld\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Applying task transformations on data with 4 workers...\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Applying task transformations on data with 4 workers...\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Incomplete parquet cache at /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/global_event_df.parquet (directory exists but contains no parquet files). Removing and rebuilding.\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "WARNING:pyhealth.datasets.base_dataset:Incomplete parquet cache at /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/global_event_df.parquet (directory exists but contains no parquet files). Removing and rebuilding.\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "No cached event dataframe found. Creating: /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/global_event_df.parquet\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:No cached event dataframe found. Creating: /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/global_event_df.parquet\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Combining data from note dataset\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.mimic4:Combining data from note dataset\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Scanning table: discharge from /content/drive/MyDrive/llm_data/note/discharge.csv.gz\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Scanning table: discharge from /content/drive/MyDrive/llm_data/note/discharge.csv.gz\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Creating combined dataframe\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.mimic4:Creating combined dataframe\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Caching event dataframe to /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/global_event_df.parquet...\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Caching event dataframe to /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/global_event_df.parquet...\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Detected Jupyter notebook environment, setting num_workers to 1\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Detected Jupyter notebook environment, setting num_workers to 1\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Single worker mode, processing sequentially\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Single worker mode, processing sequentially\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Worker 0 started processing 145914 patients. (Polars threads: 2)\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Worker 0 started processing 145914 patients. (Polars threads: 2)\n",
+            "  0%|          | 0/145914 [00:00<?, ?it/s]"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Rank 0 inferred the following `['bytes']` data format.\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "  2%|▏         | 2432/145914 [00:04<05:47, 412.79it/s]/usr/local/lib/python3.12/dist-packages/litdata/streaming/writer.py:284: UserWarning: An item was larger than the target chunk size (64.0 MB). The current chunk will be 64.0 MB in size.\n",
+            "  warnings.warn(\n",
+            "100%|██████████| 145914/145914 [04:42<00:00, 517.04it/s]"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Worker 0 finished processing patients.\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "\n",
+            "INFO:pyhealth.datasets.base_dataset:Worker 0 finished processing patients.\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Fitting processors on the dataset...\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Fitting processors on the dataset...\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Processing samples and saving to /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/tasks/PatientNoteProcessingTask_46bb372d-34eb-5a38-bd99-ca6f30f0f026/samples_cdbbc602-34e2-5a41-8643-4c76b08829f6.ld...\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Processing samples and saving to /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/tasks/PatientNoteProcessingTask_46bb372d-34eb-5a38-bd99-ca6f30f0f026/samples_cdbbc602-34e2-5a41-8643-4c76b08829f6.ld...\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Applying processors on data with 4 workers...\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Applying processors on data with 4 workers...\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Detected Jupyter notebook environment, setting num_workers to 1\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Detected Jupyter notebook environment, setting num_workers to 1\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Single worker mode, processing sequentially\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Single worker mode, processing sequentially\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Worker 0 started processing 257238 samples. (0 to 257238)\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Worker 0 started processing 257238 samples. (0 to 257238)\n",
+            "  0%|          | 0/257238 [00:00<?, ?it/s]"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Rank 0 inferred the following `['str', 'str', 'str', 'str', 'str']` data format.\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "  1%|▏         | 3584/257238 [00:00<00:22, 11325.59it/s]/usr/local/lib/python3.12/dist-packages/litdata/streaming/writer.py:284: UserWarning: An item was larger than the target chunk size (64.0 MB). The current chunk will be 64.0 MB in size.\n",
+            "  warnings.warn(\n",
+            "100%|██████████| 257238/257238 [00:51<00:00, 5013.98it/s]"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Worker 0 finished processing samples.\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "\n",
+            "INFO:pyhealth.datasets.base_dataset:Worker 0 finished processing samples.\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Cached processed samples to /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/tasks/PatientNoteProcessingTask_46bb372d-34eb-5a38-bd99-ca6f30f0f026/samples_cdbbc602-34e2-5a41-8643-4c76b08829f6.ld\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:pyhealth.datasets.base_dataset:Cached processed samples to /root/.cache/pyhealth/98de9a11-0af5-5cd9-81f2-2da31802c232/tasks/PatientNoteProcessingTask_46bb372d-34eb-5a38-bd99-ca6f30f0f026/samples_cdbbc602-34e2-5a41-8643-4c76b08829f6.ld\n"
+          ]
+        }
+      ],
+      "source": [
+        "task = DischargeNoteSummarization()\n",
+        "processed_dataset = full_note_dataset.set_task(task ,num_workers=4)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "hg2ZcXs3zh7E",
+        "outputId": "fa9b4909-c72d-4f7a-dc2e-7132f7458c4c"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "<class 'pyhealth.datasets.sample_dataset.SampleDataset'>\n"
+          ]
+        }
+      ],
+      "source": [
+        "\n",
+        "mimic_df = pd.DataFrame(processed_dataset)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "iVdEXb_gDdGw"
+      },
+      "source": [
+        "# Print the dataframe head"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "cu-s0F4rkRHm",
+        "outputId": "5f78d170-f7f5-4136-b1bd-9b10190332e5"
+      },
+      "outputs": [],
+      "source": [
+        "print(mimic_df.head())"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "HLrS_fpLkwz8",
+        "outputId": "791bc340-3f3c-4bef-d733-f499627e8f38"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "740"
+            ]
+          },
+          "execution_count": 12,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "len(mimic_df.iloc[1]['summary'])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "IUA4wtrjDtMJ"
+      },
+      "source": [
+        "# Perform further processing on the dataframe\n",
+        "\n",
+        "Run more data cleaning tasks on the mimic_df"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 49,
+          "referenced_widgets": [
+            "4002f3c605c648409776ec137ccff2bf",
+            "008bd292f7a9429eb3c7570fbe9fd093",
+            "46d831dc293444229a86184c910243a0",
+            "68ae05d25d064e0b8bd39fadb955311e",
+            "cdd7dd43d43d4bed8d5b401b5e242696",
+            "20fae152868747c2a175d48936cde9ad",
+            "a31b0d30e44d469c828cb18393803334",
+            "66a3ab0b3a484264af941300ac647d6f",
+            "b9632797c62b454fb347a8fecbb226f7",
+            "e13239bfb1214cf398f9d1d2b303f105",
+            "9da0e08960a2462cb9f1ba55656b8116"
+          ]
+        },
+        "id": "qK-wvZ4D2VLX",
+        "outputId": "c51d6460-b4d8-4ef9-ab6c-21f828f78b48"
+      },
+      "outputs": [],
+      "source": [
+        "import swifter\n",
+        "re_service = re.compile(r'^Service: (.*)$', re.IGNORECASE|re.MULTILINE)  # Either after Service:\n",
+        "re_service_extra = re.compile(r'^Date of Birth:.*Sex:\\s{0,10}\\w\\s{0,10}___: (.*)$', re.IGNORECASE|re.MULTILINE)  # Fallback if deidentified\n",
+        "\n",
+        "mimic_df['service'] = mimic_df['text'].swifter.apply(lambda s: re_service.search(s).group(1) if re_service.search(s) is not None else None)\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 49,
+          "referenced_widgets": [
+            "81495e8a1be5469a87d0bb300e82688d",
+            "db208ab02eb6476aa0d1d3bb4349bd54",
+            "27653f3de94446a69e8d55de32bb8e6f",
+            "ee6409daec7b495dae8e08b2f89ffcea",
+            "280591f53ff74b4cb18851435e5916e2",
+            "483af853bd7047f4b51ea31f9267276d",
+            "199dcfce4cf24a8b9153fe2c3c5b4914",
+            "5dcf36ab5c474cc792718ed10f3d5571",
+            "52c4ae1316c24e9599e86af27d14b394",
+            "4e91f191ce6242c9a2fe3f1b1b7aec24",
+            "3d4731460a9549258137e20e10e2ed75"
+          ]
+        },
+        "id": "ktnXJRz0a5B4",
+        "outputId": "16bf4c3b-4aa5-4c89-b837-f7f35dbcffb1"
+      },
+      "outputs": [
+        {
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "81495e8a1be5469a87d0bb300e82688d",
+              "version_major": 2,
+              "version_minor": 0
+            },
+            "text/plain": [
+              "Pandas Apply:   0%|          | 0/183 [00:00<?, ?it/s]"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        }
+      ],
+      "source": [
+        "mimic_df.loc[~mimic_df['service'].notnull(), 'service'] = mimic_df.loc[~mimic_df['service'].notnull(), 'text'].swifter.apply(lambda s: re_service_extra.search(s).group(1) if re_service_extra.search(s) is not None else None)\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "aoFW8T8ca8v4"
+      },
+      "outputs": [],
+      "source": [
+        "mimic_df.loc[~mimic_df['service'].notnull(), 'service'] = ''\n",
+        "mimic_df['service'] = mimic_df['service'].str.strip()\n",
+        "mimic_df['service'] = mimic_df['service'].str.strip(string.punctuation)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 293
+        },
+        "id": "uVpZFkh92foh",
+        "outputId": "21c727e5-62ef-4965-a18d-2345913bb777"
+      },
+      "outputs": [],
+      "source": [
+        "mimic_df.head()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "OmjNF0_N-nGo",
+        "outputId": "af220df1-4aed-46b8-8bd7-6b99380c115b"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "  Remove all leading and trailing whitespaces in each line of text\n"
+          ]
+        }
+      ],
+      "source": [
+        "mimic_df.loc[:, 'summary'] = mimic_df['summary'].str.strip()\n",
+        "print(\"  Remove all leading and trailing whitespaces in each line of text\")\n",
+        "mimic_df.loc[:, 'summary'] = mimic_df['summary'].apply(lambda s: '\\n'.join([x.strip() for x in s.split('\\n')]))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "hZwMqXL1cYnt"
+      },
+      "outputs": [],
+      "source": [
+        "def change_why_what_next_pattern_to_text(summaries):\n",
+        "    \"\"\" Change all occurrences of the static why, what, next pattern occurring in many MIMIC summaries to fluent text. \"\"\"\n",
+        "\n",
+        "    # Determine random string used instead of headings\n",
+        "    random_string = ''.join(random.choices(string.ascii_uppercase + string.digits, k=20)) + '\\n- '  # Replace removed dash\n",
+        "    #summaries = summaries.apply(lambda s: WHY_WHAT_NEXT_HEADINGS_DASHED_LIST.sub(random_string, s))\n",
+        "    # Now replace all items after the random string, end of paragraph marked by double newline\n",
+        "    dash_regex = re.compile(r'(?:\\.)?\\n-\\s{0,4}', re.MULTILINE|re.IGNORECASE)  # Also removes an optional full stop directly before the newline\n",
+        "    # Filter summaries that contain random_string, replace dash with full stop and whitespace\n",
+        "    def remove_dash_from_paragraphs(s):\n",
+        "        paragraphs = s.split(random_string)\n",
+        "        # For each paragraph remove all dashes until \\n\\n\n",
+        "        res = [paragraphs[0]]\n",
+        "        paragraphs = paragraphs[1:]\n",
+        "        for p in paragraphs:\n",
+        "            items = p.split('\\n\\n', 1)[0]\n",
+        "            items = '. '.join(dash_regex.split(items))\n",
+        "            if '\\n\\n' in p:\n",
+        "                items = items + '\\n\\n' + p.split('\\n\\n', 1)[1]\n",
+        "            res.append(items.strip())\n",
+        "        return '\\n\\n'.join(res)\n",
+        "    summaries = summaries.apply(lambda s: remove_dash_from_paragraphs(s) if random_string in s else s)\n",
+        "    return summaries"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "TXoNOBRhcP6k",
+        "outputId": "ba5801ea-100e-444a-ec9c-e31488cdb0f5"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "  Change Why in hospital / What was done / What next pattern into fluent text.\n"
+          ]
+        }
+      ],
+      "source": [
+        "print(f\"  Change Why in hospital / What was done / What next pattern into fluent text.\")\n",
+        "mimic_df.loc[:, 'summary'] = change_why_what_next_pattern_to_text(mimic_df['summary'])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "fA0fbTAVcmly",
+        "outputId": "c47b4d84-8733-48ab-fa83-db472d45796d"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "  Remove 0 list template headings.\n"
+          ]
+        }
+      ],
+      "source": [
+        "import regular_expressions\n",
+        "\n",
+        "# Remove all subheadings that are not followed by a list\n",
+        "subheading_regex = re.compile(regular_expressions.WHY_WHAT_NEXT_HEADINGS, re.MULTILINE|re.IGNORECASE)\n",
+        "print(f\"  Remove {mimic_df['summary'].apply(lambda s: subheading_regex.findall(s)).apply(len).sum()} list template headings.\")\n",
+        "mimic_df.loc[:, 'summary'] = mimic_df['summary'].apply(lambda s: subheading_regex.sub('\\n', s))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "dHu_4r5TsqJ2",
+        "outputId": "d80f8454-5170-45de-c863-27b5fd5afe37"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "  Remove all newlines in continuous text\n"
+          ]
+        }
+      ],
+      "source": [
+        "print(\"  Remove all newlines in continuous text\")\n",
+        "mimic_df.loc[:, 'summary'] = mimic_df['summary'].apply(lambda s: regular_expressions.re_newline_in_text.sub(' ', s))\n",
+        "mimic_df.loc[:, 'summary'] = mimic_df['summary'].apply(lambda s: regular_expressions.re_multiple_whitespace.sub(' ', s))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "FE_pMuCRtA5l",
+        "outputId": "179c6355-aa20-4c11-fefa-bb89c784b498"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "    Replace 5306 ___ with 'You '.\n",
+            "    Replace 308 ___ with ' you'.\n",
+            "    Replace 327 ___ with ' your '.\n",
+            "    Replace 0 ___ with ' your '.\n"
+          ]
+        }
+      ],
+      "source": [
+        "for replacement, regex in regular_expressions.SIMPLE_DEIDENTIFICATION_PATTERNS:\n",
+        "  # Print number of replacements\n",
+        "  print(f\"    Replace {mimic_df['summary'].apply(lambda s: len(regex.findall(s))).sum()} ___ with \\'{replacement}\\'.\")\n",
+        "  mimic_df.loc[:, 'summary'] = mimic_df['summary'].apply(lambda s: re.sub(regex, replacement, s))\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "TqHSf1gZvUTN",
+        "outputId": "d8b4598e-bb44-44cc-8450-448ac4bcda33"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "  Removed 0 summaries with more than 5 double newlines.\n"
+          ]
+        }
+      ],
+      "source": [
+        "old_len = len(mimic_df)\n",
+        "max_double_newlines = 5\n",
+        "mimic_df = mimic_df[mimic_df['summary'].map(lambda s: s.count('\\n\\n')) <= max_double_newlines]\n",
+        "print(f\"  Removed {old_len - len(mimic_df)} summaries with more than {max_double_newlines} double newlines.\")\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "hW9vsy2RyRcT",
+        "outputId": "24f6fbbb-9358-4677-d88c-762a8e8eda47"
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n",
+            "[nltk_data]   Unzipping tokenizers/punkt_tab.zip.\n"
+          ]
+        },
+        {
+          "data": {
+            "text/plain": [
+              "True"
+            ]
+          },
+          "execution_count": 28,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "import nltk\n",
+        "nltk.download('punkt_tab')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 66,
+          "referenced_widgets": [
+            "c704f77cc23947acae10d4186fd5cae0",
+            "44de707e6ec743c7b14dfad33f594ff2",
+            "32063c6f01d54d208d201e214e222508",
+            "0f04dc4307fc4be99b26ab9376133de6",
+            "25b9440095c646328eddc34fa891a9f8",
+            "778d3ea9350441e0bd535243efb97d8a",
+            "2d59da93e86243e7aea1b96ab90f0683",
+            "f06ca316976a49858f15913354c2113c",
+            "3b5da98abc394cb6b2b190212442c4f6",
+            "83289e583a4045d58b6bbde6e00620d7",
+            "8c1532818de347f5903c86483a6b8297"
+          ]
+        },
+        "id": "EybbLvNjx2se",
+        "outputId": "d5114a9d-113e-4846-a3b6-b68665903aff"
+      },
+      "outputs": [],
+      "source": [
+        "mimic_df['sentences'] = mimic_df['summary'].swifter.apply(lambda s: list(nltk.sent_tokenize(s)))\n",
+        "mimic_df = mimic_df[mimic_df['sentences'].map(len) >= 3]\n",
+        "print(f\"  Removed {old_len - len(mimic_df)} summaries with less than 3 sentences.\")\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 66,
+          "referenced_widgets": [
+            "5f77f53997744569aa6d09f758026afd",
+            "ad18bfabf116422a878bb74608daeb2d",
+            "e27e54ac631644678726ee2edb2b6236",
+            "43f577596fdf47bb9eb46a372e2bfc84",
+            "e2071364bdeb49dabbac2da5d7beb505",
+            "d045bdf233574808a63194e900bc1551",
+            "b6dcb898478c4920bf0b8215dc15f679",
+            "243d576b442d47d8ba4e2d1b9f140d7f",
+            "ce70cdf766a14c6cbe982314afebff47",
+            "47dd58da2e29401db04ad7f5e7dc80af",
+            "84c8d429e6fc405fb73aae1b342a116c"
+          ]
+        },
+        "id": "VaFfqcruyBAv",
+        "outputId": "967b99b6-f2ac-43d5-dcad-cbc0d0b2e191"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "  Combine all sentences with single whitespaces.\n"
+          ]
+        },
+        {
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "5f77f53997744569aa6d09f758026afd",
+              "version_major": 2,
+              "version_minor": 0
+            },
+            "text/plain": [
+              "Pandas Apply:   0%|          | 0/251248 [00:00<?, ?it/s]"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        }
+      ],
+      "source": [
+        "print(f\"  Combine all sentences with single whitespaces.\")\n",
+        "mimic_df.loc[:, 'summary'] = mimic_df['sentences'].swifter.apply(lambda s: regular_expressions.re_whitespace.sub(' ', ' '.join(s)))\n",
+        "mimic_df.drop(columns=['sentences'], inplace=True)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 66,
+          "referenced_widgets": [
+            "03866e73003540569eec57a1ec993ad9",
+            "fa329e9affc047b6b21d1d7300f8daec",
+            "567bfc9f89064ebf9708f37da1088c9f",
+            "d54ec16ae3de43b2bd7e4d776e646117",
+            "bb4ca518e85749db9be6f92cba3fcbf5",
+            "e0f3d3b983be49259edaeacb70f13b62",
+            "50ac02c1171f4b5888a8e96163e944d7",
+            "7ee209ce623047c99716e61d9d8bd27b",
+            "fd4e7f1eb0d949dea14c6f1bb369e34a",
+            "dd142ad774ae4be694894248cf4f682b",
+            "d017a0cdd0624f32ae11fdf889488d17"
+          ]
+        },
+        "id": "nfuV-xqI4G5p",
+        "outputId": "354c2116-1cc7-46c0-af2e-50e407d6955f"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "  Count occurrences of ___ in each summary.\n"
+          ]
+        },
+        {
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "03866e73003540569eec57a1ec993ad9",
+              "version_major": 2,
+              "version_minor": 0
+            },
+            "text/plain": [
+              "Pandas Apply:   0%|          | 0/251248 [00:00<?, ?it/s]"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        }
+      ],
+      "source": [
+        "print(f\"  Count occurrences of ___ in each summary.\")\n",
+        "mimic_df['num_deidentified'] = mimic_df['summary'].swifter.apply(lambda s: s.count('___'))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 328
+        },
+        "id": "oZHHzJZi4M8K",
+        "outputId": "517abeb2-ba7f-4763-a770-d5d6e33cad4b"
+      },
+      "outputs": [],
+      "source": [
+        "mimic_df.head()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "gcUSGsoU4ali",
+        "outputId": "29444c3c-c23e-40ec-9a45-42da013906c6"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "  Removed 965 summaries with more than one ___ per 10 words.\n"
+          ]
+        }
+      ],
+      "source": [
+        "num_words_per_deidentified = 10\n",
+        "old_len = len(mimic_df)\n",
+        "mimic_df = mimic_df[mimic_df['num_deidentified'] <= mimic_df['summary'].map(lambda s: len(s.split(' ')) / num_words_per_deidentified)]\n",
+        "# mimic_df = mimic_df[mimic_df['num_deidentified'] <= 10]\n",
+        "print(f\"  Removed {old_len - len(mimic_df)} summaries with more than one ___ per {num_words_per_deidentified} words.\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "D3qN7dw6aKnN"
+      },
+      "source": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "E-litWbMY4uJ",
+        "outputId": "0ea88b7d-eb1e-441f-b764-31ef8c46fa4c"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "  Removed 0 / 122083 notes from same hospital stay.\n"
+          ]
+        }
+      ],
+      "source": [
+        "\n",
+        "old_len = len(mimic_df)\n",
+        "mimic_df.drop_duplicates(subset=['subject_id'], keep='first', inplace=True)\n",
+        "print(f\"  Removed {old_len - len(mimic_df)} / {old_len} notes from same hospital stay.\")\n",
+        "old_len = len(mimic_df)\n",
+        "re_ds = re.compile(r\"Discharge Instructions:\\n\", re.IGNORECASE)\n",
+        "mimic_df = mimic_df[mimic_df['text'].str.contains(re_ds, regex=True)]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "c_soM962EfFh"
+      },
+      "source": [
+        "# Generate processed summaries and save to drive\n",
+        "\n",
+        "Save the processed summaries to drive in CSV format. These cleaned summaries will be used as training data for large language models (LLMs) to generate high-quality patient summaries and to identify hallucinations."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "bDH0Y-S0-LtS",
+        "outputId": "a667739a-a6ea-46b0-d063-444a8df0289b"
+      },
+      "outputs": [],
+      "source": [
+        "outputpath = f\"{nb_path}/outputdata/\"\n",
+        "\n",
+        "print(Path(outputpath))\n",
+        "\n",
+        "print(f\"\\nOutput data to {outputpath}\")\n",
+        "# Create output directory if it does not exist\n",
+        "Path(f\"{nb_path}/outputdata\").mkdir(parents=True, exist_ok=True)\n",
+        "mimic_df.to_csv('/content/drive/outputdata/mimic_processed_summaries.csv', index=False)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "ICSDknuhzCLP",
+        "outputId": "8d6e55ef-f727-4f5d-bb1d-c25fcf682357"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "122083\n"
+          ]
+        }
+      ],
+      "source": [
+        "print(len(mimic_df))"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "widgets": {
+      "application/vnd.jupyter.widget-state+json": {
+        "008bd292f7a9429eb3c7570fbe9fd093": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_20fae152868747c2a175d48936cde9ad",
+            "placeholder": "​",
+            "style": "IPY_MODEL_a31b0d30e44d469c828cb18393803334",
+            "value": "Pandas Apply: 100%"
+          }
+        },
+        "03866e73003540569eec57a1ec993ad9": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HBoxModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_fa329e9affc047b6b21d1d7300f8daec",
+              "IPY_MODEL_567bfc9f89064ebf9708f37da1088c9f",
+              "IPY_MODEL_d54ec16ae3de43b2bd7e4d776e646117"
+            ],
+            "layout": "IPY_MODEL_bb4ca518e85749db9be6f92cba3fcbf5"
+          }
+        },
+        "0f04dc4307fc4be99b26ab9376133de6": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_83289e583a4045d58b6bbde6e00620d7",
+            "placeholder": "​",
+            "style": "IPY_MODEL_8c1532818de347f5903c86483a6b8297",
+            "value": " 257238/257238 [01:37&lt;00:00, 3169.76it/s]"
+          }
+        },
+        "199dcfce4cf24a8b9153fe2c3c5b4914": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "20fae152868747c2a175d48936cde9ad": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "243d576b442d47d8ba4e2d1b9f140d7f": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "25b9440095c646328eddc34fa891a9f8": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "27653f3de94446a69e8d55de32bb8e6f": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "FloatProgressModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_5dcf36ab5c474cc792718ed10f3d5571",
+            "max": 183,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_52c4ae1316c24e9599e86af27d14b394",
+            "value": 183
+          }
+        },
+        "280591f53ff74b4cb18851435e5916e2": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "2d59da93e86243e7aea1b96ab90f0683": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "32063c6f01d54d208d201e214e222508": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "FloatProgressModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_f06ca316976a49858f15913354c2113c",
+            "max": 257238,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_3b5da98abc394cb6b2b190212442c4f6",
+            "value": 257238
+          }
+        },
+        "3b5da98abc394cb6b2b190212442c4f6": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "ProgressStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        },
+        "3d4731460a9549258137e20e10e2ed75": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "4002f3c605c648409776ec137ccff2bf": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HBoxModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_008bd292f7a9429eb3c7570fbe9fd093",
+              "IPY_MODEL_46d831dc293444229a86184c910243a0",
+              "IPY_MODEL_68ae05d25d064e0b8bd39fadb955311e"
+            ],
+            "layout": "IPY_MODEL_cdd7dd43d43d4bed8d5b401b5e242696"
+          }
+        },
+        "43f577596fdf47bb9eb46a372e2bfc84": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_47dd58da2e29401db04ad7f5e7dc80af",
+            "placeholder": "​",
+            "style": "IPY_MODEL_84c8d429e6fc405fb73aae1b342a116c",
+            "value": " 251248/251248 [00:30&lt;00:00, 5583.57it/s]"
+          }
+        },
+        "44de707e6ec743c7b14dfad33f594ff2": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_778d3ea9350441e0bd535243efb97d8a",
+            "placeholder": "​",
+            "style": "IPY_MODEL_2d59da93e86243e7aea1b96ab90f0683",
+            "value": "Pandas Apply: 100%"
+          }
+        },
+        "46d831dc293444229a86184c910243a0": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "FloatProgressModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_66a3ab0b3a484264af941300ac647d6f",
+            "max": 257238,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_b9632797c62b454fb347a8fecbb226f7",
+            "value": 257238
+          }
+        },
+        "47dd58da2e29401db04ad7f5e7dc80af": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "483af853bd7047f4b51ea31f9267276d": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "4e91f191ce6242c9a2fe3f1b1b7aec24": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "50ac02c1171f4b5888a8e96163e944d7": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "52c4ae1316c24e9599e86af27d14b394": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "ProgressStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        },
+        "567bfc9f89064ebf9708f37da1088c9f": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "FloatProgressModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_7ee209ce623047c99716e61d9d8bd27b",
+            "max": 251248,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_fd4e7f1eb0d949dea14c6f1bb369e34a",
+            "value": 251248
+          }
+        },
+        "5dcf36ab5c474cc792718ed10f3d5571": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "5f77f53997744569aa6d09f758026afd": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HBoxModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_ad18bfabf116422a878bb74608daeb2d",
+              "IPY_MODEL_e27e54ac631644678726ee2edb2b6236",
+              "IPY_MODEL_43f577596fdf47bb9eb46a372e2bfc84"
+            ],
+            "layout": "IPY_MODEL_e2071364bdeb49dabbac2da5d7beb505"
+          }
+        },
+        "66a3ab0b3a484264af941300ac647d6f": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "68ae05d25d064e0b8bd39fadb955311e": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_e13239bfb1214cf398f9d1d2b303f105",
+            "placeholder": "​",
+            "style": "IPY_MODEL_9da0e08960a2462cb9f1ba55656b8116",
+            "value": " 257238/257238 [00:01&lt;00:00, 180610.03it/s]"
+          }
+        },
+        "778d3ea9350441e0bd535243efb97d8a": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "7ee209ce623047c99716e61d9d8bd27b": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "81495e8a1be5469a87d0bb300e82688d": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HBoxModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_db208ab02eb6476aa0d1d3bb4349bd54",
+              "IPY_MODEL_27653f3de94446a69e8d55de32bb8e6f",
+              "IPY_MODEL_ee6409daec7b495dae8e08b2f89ffcea"
+            ],
+            "layout": "IPY_MODEL_280591f53ff74b4cb18851435e5916e2"
+          }
+        },
+        "83289e583a4045d58b6bbde6e00620d7": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "84c8d429e6fc405fb73aae1b342a116c": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "8c1532818de347f5903c86483a6b8297": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "9da0e08960a2462cb9f1ba55656b8116": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "a31b0d30e44d469c828cb18393803334": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "ad18bfabf116422a878bb74608daeb2d": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_d045bdf233574808a63194e900bc1551",
+            "placeholder": "​",
+            "style": "IPY_MODEL_b6dcb898478c4920bf0b8215dc15f679",
+            "value": "Pandas Apply: 100%"
+          }
+        },
+        "b6dcb898478c4920bf0b8215dc15f679": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "b9632797c62b454fb347a8fecbb226f7": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "ProgressStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        },
+        "bb4ca518e85749db9be6f92cba3fcbf5": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "c704f77cc23947acae10d4186fd5cae0": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HBoxModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_44de707e6ec743c7b14dfad33f594ff2",
+              "IPY_MODEL_32063c6f01d54d208d201e214e222508",
+              "IPY_MODEL_0f04dc4307fc4be99b26ab9376133de6"
+            ],
+            "layout": "IPY_MODEL_25b9440095c646328eddc34fa891a9f8"
+          }
+        },
+        "cdd7dd43d43d4bed8d5b401b5e242696": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "ce70cdf766a14c6cbe982314afebff47": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "ProgressStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        },
+        "d017a0cdd0624f32ae11fdf889488d17": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "d045bdf233574808a63194e900bc1551": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "d54ec16ae3de43b2bd7e4d776e646117": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_dd142ad774ae4be694894248cf4f682b",
+            "placeholder": "​",
+            "style": "IPY_MODEL_d017a0cdd0624f32ae11fdf889488d17",
+            "value": " 251248/251248 [00:00&lt;00:00, 403987.54it/s]"
+          }
+        },
+        "db208ab02eb6476aa0d1d3bb4349bd54": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_483af853bd7047f4b51ea31f9267276d",
+            "placeholder": "​",
+            "style": "IPY_MODEL_199dcfce4cf24a8b9153fe2c3c5b4914",
+            "value": "Pandas Apply: 100%"
+          }
+        },
+        "dd142ad774ae4be694894248cf4f682b": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "e0f3d3b983be49259edaeacb70f13b62": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "e13239bfb1214cf398f9d1d2b303f105": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "e2071364bdeb49dabbac2da5d7beb505": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "e27e54ac631644678726ee2edb2b6236": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "FloatProgressModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_243d576b442d47d8ba4e2d1b9f140d7f",
+            "max": 251248,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_ce70cdf766a14c6cbe982314afebff47",
+            "value": 251248
+          }
+        },
+        "ee6409daec7b495dae8e08b2f89ffcea": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_4e91f191ce6242c9a2fe3f1b1b7aec24",
+            "placeholder": "​",
+            "style": "IPY_MODEL_3d4731460a9549258137e20e10e2ed75",
+            "value": " 183/183 [00:00&lt;00:00, 9918.17it/s]"
+          }
+        },
+        "f06ca316976a49858f15913354c2113c": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "fa329e9affc047b6b21d1d7300f8daec": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_e0f3d3b983be49259edaeacb70f13b62",
+            "placeholder": "​",
+            "style": "IPY_MODEL_50ac02c1171f4b5888a8e96163e944d7",
+            "value": "Pandas Apply: 100%"
+          }
+        },
+        "fd4e7f1eb0d949dea14c6f1bb369e34a": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "ProgressStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
+        }
+      }
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/pyhealth/tasks/__init__.py b/pyhealth/tasks/__init__.py
index 797988377..5b344f677 100644
--- a/pyhealth/tasks/__init__.py
+++ b/pyhealth/tasks/__init__.py
@@ -66,3 +66,4 @@
     VariantClassificationClinVar,
 )
 from .patient_linkage_mimic3 import PatientLinkageMIMIC3Task
+from .discharge_note_summarization import DischargeNoteSummarization
diff --git a/pyhealth/tasks/discharge_note_summarization.py b/pyhealth/tasks/discharge_note_summarization.py
new file mode 100644
index 000000000..829e5ce0b
--- /dev/null
+++ b/pyhealth/tasks/discharge_note_summarization.py
@@ -0,0 +1,147 @@
+"""
+PyHealth task for generating high quality patient summaries with LLMs while using the MIMIC4-Note dataset.
+
+Dataset link:
+    https://physionet.org/content/ann-pt-summ/1.0.1/
+
+Dataset paper: (please cite if you use this dataset)
+    Hegselmann, S., Shen, S.Z., Gierse, F., Agrawal, M., Sontag, D. and Jiang, X., 2024. 
+    A data-centric approach to generate faithful and high quality patient summaries with large language models. 
+    arXiv preprint arXiv:2402.15422.
+
+Dataset paper link:
+    https://arxiv.org/abs/2402.15422
+
+Author:
+    Vishal Vyas (vyas9@illinois.edu)
+"""
+
+import os
+import sys
+import re
+import random
+import string
+import pickle
+import argparse
+from pathlib import Path
+from typing import List, Dict, Any, Tuple, Union
+from collections import Counter
+
+import pandas as pd
+import nltk
+import swifter
+from tqdm import tqdm
+
+from pyhealth.data import Patient
+from pyhealth.tasks import BaseTask
+from pyhealth.processors import TextProcessor
+
+import logging
+
+
+pd.options.mode.chained_assignment = None
+
+logger = logging.getLogger(__name__)
+
+
+# ── Configuration ─────────────────────────────────────────────────────────────
+
+#NOTE_ROOT = "./data/llm_data/"          # Path to MIMIC-4 notes directory
+NOTE_ROOT = "/Users/vishalvyas/uiuc_cs_598/mimic-iv-note"
+OUTPUT_DIR = "./outputdata/"            # Path for output CSV
+OUTPUT_FILE = "mimic_processed_summaries.csv"
+
+MAX_DOUBLE_NEWLINES = 5                 # Max allowed double newlines in a summary
+NUM_WORDS_PER_DEIDENTIFIED = 10        # Allowed ___ density threshold
+MIN_SUMMARY_LENGTH = 350               # Minimum character length for summaries
+MIN_SENTENCES = 3                       # Minimum sentence count per summary
+
+
+# ── PyHealth Task Definition ───────────────────────────────────────────────────
+
+class DischargeNoteSummarization(BaseTask):
+
+    """
+    A PyHealth task class for generating faithful and high quality patient summaries with Large Language Models .
+
+    Attributes:
+        task_name (str): The name of the task.
+        input_schema (Dict[str, str]): The schema for the task input.
+        output_schema (Dict[str, str]): The schema for the task output.
+
+    Examples:
+        >>> from pyhealth.datasets import MIMIC4Dataset
+        >>> from pyhealth.tasks import MIMIC4Dataset
+        >>> dataset = MIMIC4Dataset(note_root=NOTE_ROOT,note_tables=["discharge"])
+        >>> task = DataforLlmSummaries()
+        >>> samples = dataset.set_task(task)
+    """
+     
+    task_name: str = "DischargeNoteSummarization"
+    input_schema: Dict[str, str] = {
+        "subject_id": "text",
+        "hadm_id": "text",
+        "text": "text"
+    }
+
+    output_schema: Dict[str, str] = {
+        "brief_hospital_course": "text",
+        "summary": "text"
+    }
+
+    def __call__(self, patient: Patient) -> List[Dict[str, Any]]:
+        """
+        Generates patient brief_hospital_course and summary samples for a single patient.
+
+        Args:
+            patient (Patient): A patient object containing at least one 'discharge' event.
+
+        Returns:
+            List[Dict]: A list containing a dictionary for each patient visit with:
+                - "text": patient clinical notes text,
+                - "brief_hospital_course": patient brief hospital course,
+                - "summary": patient discharge summary text,
+                - "subject_id": patient identifier,
+                - "hadm_id": Hospital Admission Identifier,
+                
+        """
+        samples = []
+        subject_id = patient.patient_id
+
+        for dis in patient.get_events("discharge"):
+            textNote = dis.attr_dict["text"]
+            hadm_id = dis.attr_dict["hadm_id"]
+
+            # Extract Brief Hospital Course , remove new lines and remove whitespaces to create single paragraph
+            start = textNote.find("Brief Hospital Course:")
+            if start < 0:
+                continue
+            end = textNote.find("Medications on Admission:")
+            if end == -1:
+                end = textNote.find("Discharge Medications:")
+            if end == -1:
+                end = textNote.find("Discharge Disposition:")
+            if end == 0 or start >= end:
+                continue
+            brief_hospital_course = textNote[start:end].replace("\n", " ")
+            brief_hospital_course = " ".join(brief_hospital_course.split())
+
+            # Extract Discharge Instructions (summary) and filter out samples less than MIN_SUMMARY_LENGTH
+            start = textNote.find("Discharge Instructions:")
+            end = textNote.find("Followup Instructions:")
+            if start >= 0 and end >= 0:
+                summary = textNote[start:end].replace("\n", " ")
+                summary = " ".join(summary.split())
+                
+                summary = summary.strip()
+                #Only add to samples if length of summary greater than specified MIN_SUMMARY_LENGTH
+                if len(summary) >= MIN_SUMMARY_LENGTH:
+                    samples.append({
+                        "text": textNote,
+                        "brief_hospital_course": brief_hospital_course,
+                        "summary": summary,
+                        "subject_id": subject_id,
+                        "hadm_id": hadm_id,
+                    })
+
+        return samples
\ No newline at end of file
diff --git a/tests/core/test_discharge_note_summarization.py b/tests/core/test_discharge_note_summarization.py
new file mode 100644
index 000000000..a2a8c7a19
--- /dev/null
+++ b/tests/core/test_discharge_note_summarization.py
@@ -0,0 +1,161 @@
+"""
+Unit tests for the DischargeNoteSummarization task in pyhealth.tasks.discharge_note_summarization.
+
+Tests cover:
+    - Class attributes (task_name, input_schema, output_schema)
+    - __call__: happy-path extraction of brief_hospital_course and summary
+    - __call__: all boundary / filtering conditions that cause samples to be skipped
+    - Output dictionary structure and field types
+
+Patients and events are mocked with MagicMock, so no dataset is accessed;
+pyhealth itself must be installed because the task is imported from it.
+
+Run with:
+    python -m pytest tests/core/test_discharge_note_summarization.py -v
+    # or
+    python -m unittest tests/core/test_discharge_note_summarization.py -v
+"""
+
+
+import unittest
+from unittest.mock import MagicMock, patch
+from pathlib import Path
+from pathlib import Path
+from pyhealth.datasets import MIMIC4Dataset
+from pyhealth.data import Patient
+import tempfile
+from pyhealth.tasks import DischargeNoteSummarization
+from unittest.mock import MagicMock
+from pyhealth.data import Patient, Event
+
+
+import logging
+
+class TestDischargeNoteSummarizationTask(unittest.TestCase):
+    """Exercise DischargeNoteSummarization on mocked single-note patients."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Build the task under test once for the whole class."""
+        cls.cache_dir = tempfile.TemporaryDirectory()
+        cls.task = DischargeNoteSummarization()
+        # Threshold quoted by these tests; expected to mirror the task's own minimum.
+        cls.MIN_SUMMARY_LENGTH = 350
+
+    @classmethod
+    def tearDownClass(cls):
+        """Remove the temporary directory so it does not outlive the test run."""
+        cls.cache_dir.cleanup()
+
+    def create_mock_patient(self, note_text, patient_id="p1", hadm_id="h1", subject_id="20000003"):
+        """Helper to create a mock Patient with a single discharge event."""
+        patient = MagicMock(spec=Patient)
+        patient.patient_id = patient_id
+
+        # Create a mock Event for the discharge note
+        event = MagicMock(spec=Event)
+        event.attr_dict = {
+            "text": note_text,
+            "hadm_id": hadm_id,
+            "subject_id": subject_id,
+        }
+
+        # Mock the get_events method to return our discharge event
+        patient.get_events.side_effect = lambda event_type: [event] if event_type == "discharge" else []
+        return patient
+
+    def test_task_metadata(self):
+        """The task exposes its name and the expected schema keys."""
+        self.assertEqual(self.task.task_name, "DischargeNoteSummarization")
+        self.assertIn("text", self.task.input_schema)
+        self.assertIn("summary", self.task.output_schema)
+
+    def test_filtering_short_summary(self):
+        """A complete note whose summary is long enough yields exactly one sample."""
+        note = (
+                "Brief Hospital Course:\n"
+                "The patient is an elderly individual with a significant past medical history of chronic obstructive "
+                "pulmonary disease, congestive heart failure with a reduced ejection fraction of thirty-five percent, "
+                "and Type 2 diabetes mellitus. The patient presented to the emergency department complaining of "
+                "progressive shortness of breath, productive cough with yellow sputum, and bilateral lower extremity "
+                "edema increasing over the last five days. Upon arrival, the patient was tachycardic and hypoxic, "
+                "requiring supplemental oxygen via nasal cannula to maintain saturations above ninety-two percent. "
+                "A chest X-ray revealed bilateral pulmonary infiltrates and pleural effusions, consistent with a "
+                "multifocal pneumonia overlaying a congestive heart failure exacerbation. Laboratory results were "
+                "significant for an elevated pro-BNP and a leukocytosis with an elevated white blood cell count. "
+                "During the first forty-eight hours of admission, the patient was started on intravenous antibiotics "
+                "for community-acquired pneumonia. Diuresis was initiated with intravenous medications, resulting in "
+                "a significant net negative fluid balance over three days. The patient’s respiratory status "
+                "improved significantly; oxygen was successfully weaned to room air by hospital day four. "
+                "Endocrinology was consulted for blood glucose management, and the insulin regimen was "
+                "adjusted to a sliding scale with a long-acting basal dose. By the day of discharge, the "
+                "patient was stable, ambulating without distress, and lung sounds were markedly clearer on "
+                "auscultation. Weight had returned to the documented baseline. "
+                "Medications on Admission: "
+                "Metformin, Lisinopril, Furosemide, and an Albuterol inhaler. "
+                "Discharge Instructions: "
+                "You were treated in the hospital for a combination of pneumonia and a flare-up of your heart "
+                "failure. It is vital that you finish the entire course of oral antibiotics as prescribed, "
+                "even if you feel better. Please monitor your weight every morning before breakfast. If you "
+                "notice a weight gain of more than three pounds in a single day or five pounds in a week, "
+                "contact your primary care doctor immediately as this indicates fluid buildup. Continue to "
+                "use your salt-restricted diet and limit your total fluid intake to one and a half liters "
+                "daily to prevent further strain on your heart. Rest is encouraged for the next week; however, "
+                "try to perform light walking around the house to prevent blood clots. Avoid any heavy lifting "
+                "or strenuous exercise until cleared by your cardiologist. You should continue your home "
+                "medications as updated in the attached list. Seek immediate emergency care if you experience "
+                "chest pain, severe shortness of breath while sitting still, or if you begin coughing up blood. "
+                "We have adjusted your diuretic medication slightly to help manage your fluid levels more "
+                "effectively during your recovery. Ensure you have picked up your new prescriptions from the "
+                "pharmacy before the end of the day. It is also recommended that you receive your flu and "
+                "pneumonia vaccinations once you have fully recovered from this current illness. Please bring "
+                "your updated medication list to all upcoming appointments to ensure your medical record is accurate. "
+                "Followup Instructions: "
+                "Follow up with Cardiology next week. Follow up with your Primary Care Provider within seven days "
+                "for a transition of care visit."
+        )
+        samples = self.task(self.create_mock_patient(note))
+
+        self.assertEqual(len(samples), 1, "This summary should not be filtered out as its length more than 350.")
+        sample = samples[0]
+        self.assertEqual(sample["text"], note)
+        self.assertEqual(sample["hadm_id"], "h1")
+        self.assertTrue(sample["brief_hospital_course"].startswith("Brief Hospital Course:"))
+        self.assertNotIn("\n", sample["brief_hospital_course"])
+        self.assertNotIn("Medications on Admission:", sample["brief_hospital_course"])
+        self.assertTrue(sample["summary"].startswith("Discharge Instructions:"))
+        self.assertGreaterEqual(len(sample["summary"]), self.MIN_SUMMARY_LENGTH)
+
+    def test_edge_cases(self):
+        """Verify that summaries shorter than MIN_SUMMARY_LENGTH (350) are skipped."""
+        short_summary = "This summary is too short."  # ~26 chars
+        note = (
+            "Brief Hospital Course:\nStable.\n"
+            "Medications on Admission:\nNone.\n"
+            "Discharge Instructions:\n" + short_summary + "\n"
+            "Followup Instructions:\nNone."
+        )
+        samples = self.task(self.create_mock_patient(note))
+
+        self.assertEqual(len(samples), 0, "Should filter out samples with short summaries.")
+
+    def test_missing_brief_hospital_course(self):
+        """A note without a "Brief Hospital Course:" heading produces no sample."""
+        note = "Discharge Instructions:\nRest.\nFollowup Instructions:\nNone."
+        samples = self.task(self.create_mock_patient(note))
+
+        self.assertEqual(len(samples), 0, "Should skip notes without a Brief Hospital Course section.")
+
+    def test_edge_cases_1(self):
+        """A note with no section that ends the hospital course produces no sample."""
+        note = (
+            "Brief Hospital Course:\nStable.\n"
+            "This is a sample generated short summary that coes not contain all sections."
+        )
+        samples = self.task(self.create_mock_patient(note))
+
+        self.assertEqual(len(samples), 0, "Should skip notes whose hospital course has no end marker.")
+    
+
+if __name__ == "__main__":  # Allow running this test module directly as a script.
+    unittest.main()
\ No newline at end of file