Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 108 additions & 45 deletions server/api/views/uploadFile/test_title.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,60 +4,75 @@
from . import title


def make_page_dict(blocks):
    """Build the value a mocked ``page.get_text("dict")`` call should return.

    ``blocks`` is a list of blocks, and each block is a list of
    ``(text, font_size)`` tuples, one tuple per span. Every block is emitted
    as a text block (``"type": 0``) containing a single line that holds all of
    the block's spans.
    """
    def as_text_block(span_pairs):
        # One dict per span, carrying only the two keys the helper sets.
        span_dicts = [{"text": text, "size": size} for text, size in span_pairs]
        return {"type": 0, "lines": [{"spans": span_dicts}]}

    return {"blocks": [as_text_block(span_pairs) for span_pairs in blocks]}


def make_mock_doc(pages_data, metadata=None):
    """Create a MagicMock that stands in for a fitz.Document.

    pages_data: one entry per page. Each entry is a list of blocks, and each
        block is a list of (text, size) tuples.
    metadata: value assigned to ``doc.metadata``. A falsy value (None or an
        empty dict) is replaced with ``{"title": None}``.

    The returned mock supports ``len(doc)`` and ``doc[idx]``; each page's
    ``get_text(...)`` returns the dict built by ``make_page_dict``.
    """
    def build_page(page_blocks):
        page = MagicMock()
        page.get_text.return_value = make_page_dict(page_blocks)
        return page

    pages = [build_page(page_blocks) for page_blocks in pages_data]

    document = MagicMock()
    document.metadata = metadata if metadata else {"title": None}
    # Functions assigned to a mock's magic methods must take self first.
    document.__len__ = lambda self: len(pages_data)
    document.__getitem__ = lambda self, index: pages[index]
    return document


class TestGenerateTitle(unittest.TestCase):
def test_prefers_metadata_title_if_valid(self):
    """generate_title returns the metadata title when that title is acceptable."""
    metadata_title = "A Study Regarding The Efficacy of Drugs"
    pdf = MagicMock()
    pdf.metadata = {"title": metadata_title}

    self.assertEqual(metadata_title, title.generate_title(pdf))

def test_falls_back_to_first_page_text_if_metadata_title_is_empty(self):
doc = MagicMock()
doc.metadata = {"title": ""}
doc[0].get_text = MagicMock()

foo_block = [None] * 7
foo_block[4] = "foo"
foo_block[6] = 0

title_block = [None] * 7
title_block[4] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
title_block[6] = 0

bar_block = [None] * 7
bar_block[4] = "bar"
bar_block[6] = 0
doc[0].get_text.return_value = [foo_block, title_block, bar_block]

def test_falls_back_to_font_size_if_metadata_title_is_empty(self):
    """With an empty metadata title, the largest-font text is used as the title."""
    expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
    first_page = [
        [("foo", 10.0)],
        [(expected_title, 18.0)],
        [("bar", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[first_page], metadata={"title": ""})

    self.assertEqual(expected_title, title.generate_title(doc))

def test_falls_back_to_first_page_text_if_metadata_title_does_not_match_regex(self):
doc = MagicMock()
doc.metadata = {"title": "abcd1234"}
doc[0].get_text = MagicMock()

foo_block = [None] * 7
foo_block[4] = "foo"
foo_block[6] = 0

title_block = [None] * 7
title_block[4] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
title_block[6] = 0

bar_block = [None] * 7
bar_block[4] = "bar"
bar_block[6] = 0
doc[0].get_text.return_value = [foo_block, title_block, bar_block]

def test_falls_back_to_font_size_if_metadata_title_does_not_match_regex(self):
    """A metadata title rejected by the regex makes generate_title use the largest-font text."""
    expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
    first_page = [
        [("foo", 10.0)],
        [(expected_title, 18.0)],
        [("bar", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[first_page], metadata={"title": "abcd1234"})

    self.assertEqual(expected_title, title.generate_title(doc))

@patch("api.views.uploadFile.title.openAIServices.openAI")
def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI):
doc = MagicMock()
doc.metadata = {"title": None}
doc[0].get_text.return_value = []
doc = make_mock_doc(
pages_data=[[]] # no blocks at all
)

mock_response = MagicMock()
mock_response.choices = [MagicMock()]
Expand All @@ -70,9 +85,7 @@ def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI):

@patch("api.views.uploadFile.title.openAIServices.openAI")
def test_strips_quotes_from_openai_title(self, mock_openAI):
doc = MagicMock()
doc.metadata = {"title": None}
doc[0].get_text.return_value = []
doc = make_mock_doc(pages_data=[[]])

mock_response = MagicMock()
mock_response.choices = [MagicMock()]
Expand All @@ -85,9 +98,7 @@ def test_strips_quotes_from_openai_title(self, mock_openAI):

@patch("api.views.uploadFile.title.openAIServices.openAI")
def test_truncates_long_openai_title(self, mock_openAI):
doc = MagicMock()
doc.metadata = {"title": None}
doc[0].get_text.return_value = []
doc = make_mock_doc(pages_data=[[]])

mock_response = MagicMock()
mock_response.choices = [MagicMock()]
Expand All @@ -98,3 +109,55 @@ def test_truncates_long_openai_title(self, mock_openAI):

# Ensure the title is truncated to fit the UploadFile model's title field (max_length=255), since OpenAI responses may exceed this limit
self.assertLessEqual(len(result), 255)

def test_font_size_joins_adjacent_spans_in_same_block(self):
    """Spans of one block that share the largest font are joined into one title."""
    title_block = [("Advances in Mood Disorder", 18.0), ("Pharmacotherapy", 18.0)]
    first_page = [
        [("Author Name", 10.0)],
        title_block,
        [("Some journal info", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[first_page])

    extracted = title.extract_title_by_font_size(doc)

    self.assertEqual(extracted, "Advances in Mood Disorder Pharmacotherapy")

def test_font_size_ignores_short_spans(self):
    """Very short spans such as superscript markers are left out of the title."""
    first_page = [
        # The "*" span is shorter than 2 characters, so only the real text remains.
        [("Advances in Mood Disorder Pharmacotherapy", 18.0), ("*", 18.0)],
        [("Author Name et al.", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[first_page])

    extracted = title.extract_title_by_font_size(doc)

    self.assertEqual(extracted, "Advances in Mood Disorder Pharmacotherapy")

def test_font_size_returns_none_when_no_regex_match(self):
    """None is returned when the largest-font text fails the title regex."""
    first_page = [
        # Only 2 words — regex requires at least 3
        [("Psychiatry Research", 18.0)],
        [("Author Name et al.", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[first_page])

    extracted = title.extract_title_by_font_size(doc)

    self.assertIsNone(extracted)

def test_font_size_finds_title_on_later_page(self):
    """A title on page 2 is still found when it carries the largest font."""
    # Page 1: cover page with smaller text.
    cover_page = [
        [("Some preamble text here", 12.0)],
    ]
    # Page 2: the actual title in a larger font.
    title_page = [
        [("Advances in Mood Disorder Pharmacotherapy", 18.0)],
        [("Author Name et al.", 10.0)],
    ]
    doc = make_mock_doc(pages_data=[cover_page, title_page])

    extracted = title.extract_title_by_font_size(doc)

    self.assertEqual(extracted, "Advances in Mood Disorder Pharmacotherapy")
95 changes: 70 additions & 25 deletions server/api/views/uploadFile/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,44 +6,89 @@


# Regular expression to match common research white-paper titles. Created by ChatGPT.
# requires at least 3 words, no dates, no version numbers.
# requires at least 3 words, no version numbers.
title_regex = re.compile(
r'^(?=(?:\b\w+\b[\s:,\-\(\)]*){3,})(?!.*\b(?:19|20)\d{2}\b)(?!.*\bv\d+\b)[A-Za-z0-9][\w\s:,\-\(\)]*[A-Za-z\)]$', re.IGNORECASE)
r"^(?=(?:\b\w+\b[^A-Za-z0-9]*){3,})(?!.*\bv\d+\b)[A-Za-z0-9].+[A-Za-z\)?!]$", re.IGNORECASE)


def generate_title(pdf: fitz.Document) -> str | None:
document_metadata_title = pdf.metadata["title"]
if document_metadata_title is not None and document_metadata_title != "":
if title_regex.match(document_metadata_title):
print("suitable title was found in metadata")
return document_metadata_title.strip()
else:
print("metadata title did not match regex")

print("Looking for title in first page text")
first_page = pdf[0]
first_page_blocks = first_page.get_text("blocks")
text_blocks = [
block[4].strip().replace("\n", " ")
for block in first_page_blocks
if block[6] == 0 # only include text blocks.
]

# For some reason, extracted PDF text has extra spaces. Collapse them here.
regex = r"\s{2,}"
text_blocks = [re.sub(regex, " ", text) for text in text_blocks]

if len(text_blocks) != 0:
for text in text_blocks:
if title_regex.match(text):
return text

print(
"no suitable title found in first page text. Using GPT-4 to summarize the PDF")
font_title = extract_title_by_font_size(pdf)
if font_title:
return font_title

gpt_title = summarize_pdf(pdf)
return gpt_title or None


def extract_title_by_font_size(
    pdf: fitz.Document,
    max_pages: int = 3,
    *,
    min_span_chars: int = 2,
    min_font_size: float = 6.0,
    max_length: int = 255,
) -> str | None:
    """Extract a title from the largest-font text on the first few pages.

    The first ``max_pages`` pages are scanned for text spans. Contiguous runs
    of spans set in the largest font size found are joined into candidate
    titles, candidates are validated against ``title_regex``, and the longest
    valid candidate wins.

    Args:
        pdf: Document to scan. It must support ``len()``, integer indexing,
            and pages exposing ``get_text("dict")``.
        max_pages: Maximum number of leading pages to scan.
        min_span_chars: Spans whose stripped text is shorter than this are
            skipped (e.g. a lone "*" footnote marker).
        min_font_size: Spans with a smaller font size are skipped.
        max_length: Maximum length of the returned title. The default of 255
            presumably mirrors the UploadFile model's title field limit;
            verify against the model.

    Returns:
        The best title candidate, truncated to ``max_length`` characters with
        no trailing whitespace, or None when no usable span exists or no
        candidate matches ``title_regex``.
    """
    pages_to_scan = min(max_pages, len(pdf))

    # First pass: collect every usable span in reading order and track the
    # largest font size seen.
    spans = []
    max_font_size = 0.0
    for page_idx in range(pages_to_scan):
        page_dict = pdf[page_idx].get_text("dict")
        for block in page_dict["blocks"]:
            if block.get("type") != 0:
                # Only blocks with type 0 are scanned for text.
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"].strip()
                    size = span["size"]
                    if len(text) < min_span_chars or size < min_font_size:
                        continue
                    spans.append((text, size))
                    if size > max_font_size:
                        max_font_size = size

    if not spans:
        return None

    # Second pass: gather contiguous runs of spans at the max font size.
    # Runs continue across block boundaries so a title split over several
    # blocks is joined into a single candidate. Skipped spans were never
    # collected, so they cannot interrupt a run; only a collected span of a
    # different size ends it.
    # NOTE(review): sizes are compared with exact float equality; confirm the
    # extractor reports identical values for visually identical sizes.
    candidates = []
    current_run = []
    for text, size in spans:
        if size == max_font_size:
            current_run.append(text)
        elif current_run:
            candidates.append(" ".join(current_run))
            current_run = []
    if current_run:
        candidates.append(" ".join(current_run))

    # Collapse runs of whitespace and keep only candidates that look like a
    # title according to the module's title regex.
    valid_titles = []
    for candidate in candidates:
        cleaned = re.sub(r"\s{2,}", " ", candidate).strip()
        if title_regex.match(cleaned):
            valid_titles.append(cleaned)

    if not valid_titles:
        return None

    # Longest wins because real titles are typically longer than section
    # headers (e.g., "About the Author") that may share the same max font
    # size. max() returns the first of equally long candidates.
    best = max(valid_titles, key=len)

    # Truncation can cut right after a space; strip so the result never ends
    # in whitespace.
    return best[:max_length].rstrip()


def summarize_pdf(pdf: fitz.Document) -> str:
"""
Summarize a PDF document using OpenAI's GPT-4 model.
Expand Down
Loading