diff --git a/server/api/views/uploadFile/test_title.py b/server/api/views/uploadFile/test_title.py index 0ec9e1bc..531a3f07 100644 --- a/server/api/views/uploadFile/test_title.py +++ b/server/api/views/uploadFile/test_title.py @@ -4,6 +4,39 @@ from . import title +def make_page_dict(blocks): + """Helper to build a get_text("dict") return value from a simple list of blocks. + Each block is a list of (text, font_size) tuples representing spans. + """ + dict_blocks = [] + for spans in blocks: + dict_blocks.append({ + "type": 0, + "lines": [{ + "spans": [{"text": text, "size": size} for text, size in spans] + }] + }) + return {"blocks": dict_blocks} + + +def make_mock_doc(pages_data, metadata=None): + """Build a mock fitz.Document. + pages_data: list of block lists, one per page. Each block is a list of (text, size) tuples. + """ + doc = MagicMock() + doc.metadata = metadata or {"title": None} + doc.__len__ = lambda self: len(pages_data) + + mock_pages = [] + for page_blocks in pages_data: + page = MagicMock() + page.get_text.return_value = make_page_dict(page_blocks) + mock_pages.append(page) + + doc.__getitem__ = lambda self, idx: mock_pages[idx] + return doc + + class TestGenerateTitle(unittest.TestCase): def test_prefers_metadata_title_if_valid(self): doc = MagicMock() @@ -11,53 +44,35 @@ def test_prefers_metadata_title_if_valid(self): self.assertEqual( "A Study Regarding The Efficacy of Drugs", title.generate_title(doc)) - def test_falls_back_to_first_page_text_if_metadata_title_is_empty(self): - doc = MagicMock() - doc.metadata = {"title": ""} - doc[0].get_text = MagicMock() - - foo_block = [None] * 7 - foo_block[4] = "foo" - foo_block[6] = 0 - - title_block = [None] * 7 - title_block[4] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia" - title_block[6] = 0 - - bar_block = [None] * 7 - bar_block[4] = "bar" - bar_block[6] = 0 - doc[0].get_text.return_value = [foo_block, 
title_block, bar_block] - + def test_falls_back_to_font_size_if_metadata_title_is_empty(self): + doc = make_mock_doc( + pages_data=[[ + [("foo", 10.0)], + [("Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia", 18.0)], + [("bar", 10.0)], + ]], + metadata={"title": ""}, + ) expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia" self.assertEqual(expected_title, title.generate_title(doc)) - def test_falls_back_to_first_page_text_if_metadata_title_does_not_match_regex(self): - doc = MagicMock() - doc.metadata = {"title": "abcd1234"} - doc[0].get_text = MagicMock() - - foo_block = [None] * 7 - foo_block[4] = "foo" - foo_block[6] = 0 - - title_block = [None] * 7 - title_block[4] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia" - title_block[6] = 0 - - bar_block = [None] * 7 - bar_block[4] = "bar" - bar_block[6] = 0 - doc[0].get_text.return_value = [foo_block, title_block, bar_block] - + def test_falls_back_to_font_size_if_metadata_title_does_not_match_regex(self): + doc = make_mock_doc( + pages_data=[[ + [("foo", 10.0)], + [("Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia", 18.0)], + [("bar", 10.0)], + ]], + metadata={"title": "abcd1234"}, + ) expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia" self.assertEqual(expected_title, title.generate_title(doc)) @patch("api.views.uploadFile.title.openAIServices.openAI") def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI): - doc = MagicMock() - doc.metadata = {"title": None} - doc[0].get_text.return_value = [] + doc = make_mock_doc( + pages_data=[[]] # no blocks at all + 
) mock_response = MagicMock() mock_response.choices = [MagicMock()] @@ -70,9 +85,7 @@ def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI): @patch("api.views.uploadFile.title.openAIServices.openAI") def test_strips_quotes_from_openai_title(self, mock_openAI): - doc = MagicMock() - doc.metadata = {"title": None} - doc[0].get_text.return_value = [] + doc = make_mock_doc(pages_data=[[]]) mock_response = MagicMock() mock_response.choices = [MagicMock()] @@ -85,9 +98,7 @@ def test_strips_quotes_from_openai_title(self, mock_openAI): @patch("api.views.uploadFile.title.openAIServices.openAI") def test_truncates_long_openai_title(self, mock_openAI): - doc = MagicMock() - doc.metadata = {"title": None} - doc[0].get_text.return_value = [] + doc = make_mock_doc(pages_data=[[]]) mock_response = MagicMock() mock_response.choices = [MagicMock()] @@ -98,3 +109,55 @@ def test_truncates_long_openai_title(self, mock_openAI): # Ensure the title is truncated to fit the UploadFile model's title field (max_length=255), since OpenAI responses may exceed this limit self.assertLessEqual(len(result), 255) + + def test_font_size_joins_adjacent_spans_in_same_block(self): + """A title split across multiple spans in the same block should be joined.""" + doc = make_mock_doc( + pages_data=[[ + [("Author Name", 10.0)], + [("Advances in Mood Disorder", 18.0), ("Pharmacotherapy", 18.0)], + [("Some journal info", 10.0)], + ]], + ) + result = title.extract_title_by_font_size(doc) + self.assertEqual(result, "Advances in Mood Disorder Pharmacotherapy") + + def test_font_size_ignores_short_spans(self): + """Superscript markers and other tiny spans should be filtered out.""" + doc = make_mock_doc( + pages_data=[[ + [("Advances in Mood Disorder Pharmacotherapy", 18.0), ("*", 18.0)], + [("Author Name et al.", 10.0)], + ]], + ) + # The "*" span is < 2 chars, so it should be ignored; title is just the real text + result = title.extract_title_by_font_size(doc) + self.assertEqual(result, "Advances 
in Mood Disorder Pharmacotherapy") + + def test_font_size_returns_none_when_no_regex_match(self): + """If the largest-font text doesn't match the title regex, return None.""" + doc = make_mock_doc( + pages_data=[[ + # Only 2 words — regex requires at least 3 + [("Psychiatry Research", 18.0)], + [("Author Name et al.", 10.0)], + ]], + ) + result = title.extract_title_by_font_size(doc) + self.assertIsNone(result) + + def test_font_size_finds_title_on_later_page(self): + """Title on page 2 should still be found if it has the largest font.""" + doc = make_mock_doc( + pages_data=[ + [ # page 1: cover page with smaller text + [("Some preamble text here", 12.0)], + ], + [ # page 2: actual title in larger font + [("Advances in Mood Disorder Pharmacotherapy", 18.0)], + [("Author Name et al.", 10.0)], + ], + ], + ) + result = title.extract_title_by_font_size(doc) + self.assertEqual(result, "Advances in Mood Disorder Pharmacotherapy") diff --git a/server/api/views/uploadFile/title.py b/server/api/views/uploadFile/title.py index 17f52a74..f4e562c1 100644 --- a/server/api/views/uploadFile/title.py +++ b/server/api/views/uploadFile/title.py @@ -6,44 +6,89 @@ # regular expression to match common research white paper titles. Created by Chat-gpt -# requires at least 3 words, no dates, no version numbers. +# requires at least 3 words, no version numbers. 
title_regex = re.compile(
    r"^(?=(?:\b\w+\b[^A-Za-z0-9]*){3,})(?!.*\bv\d+\b)[A-Za-z0-9].+[A-Za-z\)?!]$", re.IGNORECASE)

# Every title returned from this module is capped at this length so that it
# fits the UploadFile model's title field (max_length=255).
_MAX_TITLE_LENGTH = 255

# Font sizes are floats; spans whose size is within this many points of the
# largest size found are treated as being set in the same (title) font size.
_FONT_SIZE_TOLERANCE = 0.05


def generate_title(pdf: fitz.Document) -> str | None:
    """Return a title for *pdf*, or None if no strategy produces one.

    Strategies are tried in order:

    1. The ``title`` entry of the document metadata, if it matches
       ``title_regex``.
    2. The largest-font text on the first pages
       (``extract_title_by_font_size``).
    3. A summary produced by ``summarize_pdf`` (defined elsewhere in this
       module).
    """
    # Guard against a missing metadata dict or a missing/None "title" entry.
    metadata = pdf.metadata or {}
    # Strip *before* matching: the regex is anchored at both ends, so
    # surrounding whitespace would otherwise reject a perfectly good title.
    metadata_title = (metadata.get("title") or "").strip()
    if metadata_title and title_regex.match(metadata_title):
        return metadata_title[:_MAX_TITLE_LENGTH].rstrip()

    font_title = extract_title_by_font_size(pdf)
    if font_title:
        return font_title

    gpt_title = summarize_pdf(pdf)
    return gpt_title or None


def _iter_title_spans(page_dict):
    """Yield ``(text, size)`` for each usable text span of one page dict.

    Non-text blocks (``type != 0``) are skipped, as are spans shorter than
    two characters (e.g. a lone "*" marker) and spans smaller than 6pt.
    """
    for block in page_dict.get("blocks", []):
        if block.get("type") != 0:
            continue
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span["text"].strip()
                size = span["size"]
                if len(text) < 2 or size < 6.0:
                    continue
                yield text, size


def extract_title_by_font_size(pdf: fitz.Document, max_pages: int = 3) -> str | None:
    """Extract the title as the longest valid run of largest-font text.

    The first ``max_pages`` pages are scanned for the largest font size.
    Consecutive spans at that size (within ``_FONT_SIZE_TOLERANCE``) are
    joined into candidate strings; each candidate is whitespace-normalized
    and validated against ``title_regex``.

    Args:
        pdf: Document supporting ``len()``, integer indexing, and pages with
            ``get_text("dict")``.
        max_pages: Maximum number of leading pages to scan.

    Returns:
        The longest matching candidate, truncated to ``_MAX_TITLE_LENGTH``
        characters, or None if nothing on the scanned pages qualifies.
    """
    pages_to_scan = min(max_pages, len(pdf))

    # First pass: collect usable spans page by page and find the max size.
    pages = [
        list(_iter_title_spans(pdf[page_idx].get_text("dict")))
        for page_idx in range(pages_to_scan)
    ]
    max_font_size = max(
        (size for spans in pages for _, size in spans), default=0.0)
    if max_font_size <= 0.0:
        return None

    # Second pass: gather contiguous runs of spans at the max font size.
    # Runs continue across block boundaries so multi-block titles are joined
    # into one candidate, but never across pages: a heading on the next page
    # is not a continuation of the title. Comparing against a threshold
    # instead of using `==` keeps a title together when its spans report
    # sizes that differ only by floating-point noise.
    threshold = max_font_size - _FONT_SIZE_TOLERANCE
    candidates = []
    for spans in pages:
        current_run = []
        for text, size in spans:
            if size >= threshold:
                current_run.append(text)
            elif current_run:
                candidates.append(" ".join(current_run))
                current_run = []
        if current_run:
            candidates.append(" ".join(current_run))

    # Normalize all whitespace (including single tabs/newlines), then keep
    # only candidates that look like a title.
    matches = [
        cleaned
        for cleaned in (" ".join(candidate.split()) for candidate in candidates)
        if title_regex.match(cleaned)
    ]
    if not matches:
        return None

    # Longest wins because real titles are typically longer than section
    # headers (e.g., "About the Author") that may share the max font size.
    # max() returns the first of equally long candidates.
    return max(matches, key=len)[:_MAX_TITLE_LENGTH].rstrip()