fix: parse in sim-struct chat-data; unify image analysis prompt with context injection & improve LLM robustness #913

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

CaralHsi merged 4 commits into MemTensor:dev-20260119-v2.0.3 from CaralHsi:feat/fig-with-context

Jan 21, 2026

src/memos/mem_reader/read_multi_modal/image_parser.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -151,6 +151,17 @@ def parse_fine( @@
                 IMAGE_ANALYSIS_PROMPT_ZH if lang == "zh" else IMAGE_ANALYSIS_PROMPT_EN
             )
+            # Add context if available
+            context_text = ""
+            if context_items:
+                for item in context_items:
+                    if hasattr(item, "memory") and item.memory:
+                        context_text += f"{item.memory}\n"
+            context_text = context_text.strip()
+            # Inject context into prompt when possible
+            image_analysis_prompt = image_analysis_prompt.replace("{context}", context_text)
             # Build messages with image content
             messages = [
                 {
@@ Expand All / @@ -168,21 +179,6 @@ def parse_fine( @@
                 }
             ]
-            # Add context if available
-            if context_items:
-                context_text = ""
-                for item in context_items:
-                    if hasattr(item, "memory") and item.memory:
-                        context_text += f"{item.memory}\n"
-                if context_text:
-                    messages.insert(
-,
-                        {
-                            "role": "system",
-                            "content": f"Context from previous conversation:\n{context_text}",
-                        },
-                    )
             try:
                 # Call LLM with vision model
                 response_text = self.llm.generate(messages)
@@ Expand All / @@ -192,6 +188,9 @@ def parse_fine( @@
                 # Parse JSON response
                 response_json = self._parse_json_result(response_text)
+                if not response_json:
+                    logger.warning(f"[ImageParser] Fail to parse response from LLM: {response_text}")
+                    return []
                 # Extract memory items from response
                 memory_items = []
@@ Expand Down Expand Up / @@ -323,8 +322,7 @@ def _cheap_close(t: str) -> str: @@
                         return json.loads(s)
                     except json.JSONDecodeError:
                         pass
-                logger.error(f"[ImageParser] Failed to parse JSON: {e}\nResponse: {response_text}")
-                return {}
+                logger.warning(f"[ImageParser] Failed to parse JSON: {e}\nResponse: {response_text}")
         def _create_memory_item(
             self,
@@ Expand Down @@

src/memos/mem_reader/simple_struct.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -224,6 +224,22 @@ def _make_memory_item(
  
                ),

            )

        def _safe_generate(self, messages: list[dict]) -> str | None:

            try:

                return self.llm.generate(messages)

            except Exception:

                logger.exception("[LLM] Generation failed")

                return None

        def _safe_parse(self, text: str | None) -> dict | None:

            if not text:

                return None

            try:

                return parse_json_result(text)

            except Exception:

                logger.warning("[LLM] JSON parse failed")

                return None

        def _get_llm_response(self, mem_str: str, custom_tags: list[str] | None) -> dict:

            lang = detect_lang(mem_str)

            template = PROMPT_DICT["chat"][lang]

    @@ -240,13 +256,13 @@ def _get_llm_response(self, mem_str: str, custom_tags: list[str] | None) -> dict
  
            if self.config.remove_prompt_example:

                prompt = prompt.replace(examples, "")

            messages = [{"role": "user", "content": prompt}]

            try:

                response_text = self.llm.generate(messages)

                response_json = parse_json_result(response_text)

            except Exception as e:

                logger.error(f"[LLM] Exception during chat generation: {e}")

                response_json = {

                    "memory list": [

            response_text = self._safe_generate(messages)

            response_json = self._safe_parse(response_text)

            if not response_json:

                return {

                    "memory_list": [

                        {

                            "key": mem_str[:10],

                            "memory_type": "UserMemory",

    @@ -256,6 +272,7 @@ def _get_llm_response(self, mem_str: str, custom_tags: list[str] | None) -> dict
  
                    ],

                    "summary": mem_str,

                }

            return response_json

        def _iter_chat_windows(self, scene_data_info, max_tokens=None, overlap=200):

src/memos/mem_reader/utils.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -70,7 +70,7 @@ def _cheap_close(t: str) -> str: @@
             if "Invalid \\escape" in str(e):
                 s = s.replace("\\", "\\\\")
                 return json.loads(s)
-            logger.error(
+            logger.warning(
                 f"[JSONParse] Failed to decode JSON: {e}\nTail: Raw {response_text} \
                 json: {s}"
             )
@@ Expand Down @@

src/memos/templates/mem_reader_prompts.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -604,13 +604,12 @@
  
    你可以选择与memory相关的在上述列表中可以加入tags，同时你可以根据memory的内容自由添加tags。"""

    IMAGE_ANALYSIS_PROMPT_EN = """You are an intelligent memory assistant. Analyze the provided image and extract meaningful information that should be remembered.

    IMAGE_ANALYSIS_PROMPT_EN = """You are an intelligent memory assistant. Please analyze the provided image based on the contextual information (if any) and extract meaningful information that should be remembered.

    Please extract:

    1. **Visual Content**: What objects, people, scenes, or text are visible in the image?

    2. **Context**: What is the context or situation depicted?

    3. **Key Information**: What important details, facts, or information can be extracted?

    4. **User Relevance**: What aspects of this image might be relevant to the user's memory?

    2. **Key Information**: What important details, facts, or information can be extracted?

    3. **User Relevance**: What aspects of this image might be relevant to the user's memory?

    Return a valid JSON object with the following structure:

    {

    @@ -630,16 +629,44 @@
  
    - The `key`, `value`, `tags`, `summary` and `memory_type` fields should match the language of the user's context if available, otherwise use English.

    - Keep `memory_type` in English.

    Example:

    Reference context:

    role-user: I plan to carry this for hiking at Mount Siguniang

    role-Bob: Me too

    Image URL to be analyzed: https://xxxxxx.jpg

    {

      "memory list": [

        {

          "key": "Cylindrical Carry-On Item Attached to Hiking Backpack",

          "memory_type": "LongTermMemory",

          "value": "An outdoor hiking backpack has a black cylindrical carry-on item secured to its side with webbing straps. The cylinder is positioned vertically, with a length close to the height of the backpack’s side pocket. The exterior is dark-colored with a textured or perforated surface, clearly designed for outdoor use and convenient access while walking.",

          "tags": ["outdoor", "hiking", "backpack", "side-mounted", "carry-on item"]

        },

        {

          "key": "Mount Siguniang Hiking Equipment Plan",

          "memory_type": "UserMemory",

          "value": "Both the user and Bob explicitly plan to carry this outdoor backpack during their hiking trip to Mount Siguniang, indicating that this carrying setup has been included in their preparation for a high-altitude hiking journey.",

          "tags": ["user plan", "Mount Siguniang", "hiking", "trekking trip"]

        }

      ],

      "summary": "The image presents a typical hiking setup in an outdoor context. A hiking or travel backpack has a black cylindrical carry-on item attached to its side, suggesting a lightweight and practical configuration for long-distance walking. The overall visual tone emphasizes mobility and convenience. The accompanying text highlights ease of travel, no installation required, and suitability for carrying while on the move. Clear specifications for the cylindrical item are also shown, including its width (approximately 2.56 inches), height (approximately 9.76 inches), and net weight (about 1.45 pounds), underscoring its compact size and manageable weight. Combined with the provided context, this setup is planned for a hiking trip to Mount Siguniang, giving the image a clear personal usage scenario and long-term memory relevance."

    }

    If context is provided, incorporate it into the extraction. If no context is given, extract only the key information from the image.

    Reference context:

    {context}

    Focus on extracting factual, observable information from the image. Avoid speculation unless clearly relevant to user memory."""

    IMAGE_ANALYSIS_PROMPT_ZH = """您是一个智能记忆助手。请分析提供的图像并提取应该被记住的有意义信息。

    IMAGE_ANALYSIS_PROMPT_ZH = """您是一个智能记忆助手。请根据上下文信息（如有）分析提供的图像并提取应该被记住的有意义信息。

    请提取：

    1. **视觉内容**：图像中可见的物体、人物、场景或文字是什么？

    2. **上下文**：图像描绘了什么情境或情况？

    3. **关键信息**：可以提取哪些重要的细节、事实或信息？

    4. **用户相关性**：图像的哪些方面可能与用户的记忆相关？

    2. **关键信息**：可以提取哪些重要的细节、事实或信息？

    3. **用户相关性**：图像的哪些方面可能与用户的记忆相关？

    返回一个有效的 JSON 对象，格式如下：

    {

    @@ -659,7 +686,36 @@
  
    - `key`、`value`、`tags`、`summary` 和 `memory_type` 字段应该与用户上下文的语言匹配（如果可用），否则使用中文。

    - `memory_type` 保持英文。

    专注于从图像中提取事实性、可观察的信息。除非与用户记忆明显相关，否则避免推测。"""

    例子：

    参考的上下文：

    role-user: 我打算背这个去四姑娘山徒步

    role-bob: 我也是

    待解析的url：https://xxxxxx.jpg

    {

      "memory list": [

        {

          "key": "徒步背包侧挂圆柱形随行物品",

          "memory_type": "LongTermMemory",

          "value": "一只户外徒步背包侧面通过织带固定了一件黑色圆柱形随行物品。圆柱体纵向放置，长度接近背包侧袋高度，外壳为深色并带有防滑或透气纹理，整体外观明显为户外使用设计，方便在行走过程中快速取放。",

          "tags": ["户外", "徒步", "背包", "侧挂", "随行物品"]

        },

        {

          "key": "四姑娘山徒步随身装备计划",

          "memory_type": "UserMemory",

          "value": "用户和Bob明确计划在四姑娘山徒步行程中背负该款户外背包，说明这套背负方式已被纳入他们高海拔徒步行程的装备准备中。",

          "tags": ["用户计划", "四姑娘山", "徒步", "登山行程"]

        }

      ],

      "summary": "画面展示了一种典型的徒步出行配置：一只登山或旅行背包侧边固定着一件黑色圆柱形随行物品，整体氛围明显指向户外行走和轻量化携带场景。画面中的文字强调轻便、无需安装、适合随身携带的使用理念，并直接给出了随行物品的尺寸与重量信息（宽度约2.56英寸、高度约9.76英寸、净重约1.45磅），突出了便于背负和长时间携行的特点。结合用户给出的背景，这套装备被计划用于四姑娘山徒步，具备清晰的个人使用情境和长期记忆价值。"

    }

    如果给定了上下文，就结合上下文信息进行提取，如果没有给定上下文，请直接提取图片的关键信息。

    参考的上下文：

    {context}

    专注于从图像中提取事实性、可观察的信息。除非与用户记忆明显相关，否则避免推测。

    """

    SIMPLE_STRUCT_REWRITE_MEMORY_PROMPT_BACKUP = """

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

fix: parse in sim-struct chat-data; unify image analysis prompt with context injection & improve LLM robustness #913

Uh oh!

Diff view

Diff view

There are no files selected for viewing