diff --git a/src/memos/mem_reader/multi_modal_struct.py b/src/memos/mem_reader/multi_modal_struct.py index 0b3e19208..679cb711d 100644 --- a/src/memos/mem_reader/multi_modal_struct.py +++ b/src/memos/mem_reader/multi_modal_struct.py @@ -807,7 +807,7 @@ def _process_one_item( if result: fine_memory_items.extend(result) except Exception as e: - logger.error(f"[MultiModalFine] worker error: {e}") + logger.error(f"[MultiModalFine] worker error: {e} {traceback.format_exc()}") # related preceding and following rawfilememories fine_memory_items = self._relate_preceding_following_rawfile_memories(fine_memory_items) diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index c8f944e41..b440a4aef 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -367,7 +367,7 @@ def create_source( ) -> SourceMessage: """Create SourceMessage from file content part.""" if isinstance(message, dict): - file_info = message.get("file", {}) + file_info = message.get("file", {}) or {} source_dict = { "type": "file", "doc_path": file_info.get("filename") or file_info.get("file_id", ""), diff --git a/src/memos/mem_reader/read_multi_modal/utils.py b/src/memos/mem_reader/read_multi_modal/utils.py index 96918589b..2e6ab1d6e 100644 --- a/src/memos/mem_reader/read_multi_modal/utils.py +++ b/src/memos/mem_reader/read_multi_modal/utils.py @@ -45,6 +45,10 @@ ) +KEYS_DROP_LABEL = r"(text|type|image_url|imageurl|url|file|file_id|image_id|file_data)" +ID_KEYS_DROP_VALUE = r"(file_id|image_id)" + + def parse_json_result(response_text: str) -> dict: """ Parse JSON result from LLM response. @@ -356,13 +360,25 @@ def detect_lang(text): cleaned_text = re.sub(r"\[[\d\-:\s]+\]", "", cleaned_text) # remove URLs to prevent the dilution of Chinese characters cleaned_text = re.sub(r'https?://[^\s<>"{}|\\^`\[\]]+', "", cleaned_text) - # remove MessageType schema keywords (multimodal JSON noise) + # remove common id-like tokens (uuid-ish / file_id / image_id / + # my_id_01 etc.) + # uuid + cleaned_text = re.sub( + r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b", + " ", + cleaned_text, + flags=re.IGNORECASE, + ) + # key:value where key ends with _id or is id, and value is quoted or bare token + cleaned_text = re.sub( + r'(?i)\b[a-z_]*id\b\s*[:=]\s*(".*?"|\'.*?\'|[a-z0-9_\-]+)', " ", cleaned_text + ) cleaned_text = re.sub( - r"\b(text|type|image_url|imageurl|url)\b", "", cleaned_text, flags=re.IGNORECASE + r'(?i)\b[a-z_]*_id\b\s*[:=]\s*(".*?"|\'.*?\'|[a-z0-9_\-]+)', " ", cleaned_text ) # remove schema keywords like text / type / image_url / url cleaned_text = re.sub( - r"\b(text|type|image_url|imageurl|url|file|file_id)\b", + r"\b(text|type|image_url|imageurl|url|file|file_id|image_id|file_data)\b", "", cleaned_text, flags=re.IGNORECASE,