From d8f1595a35dc48078390d6c68297a0b1adcde0da Mon Sep 17 00:00:00 2001 From: Badiboy Date: Sat, 14 Feb 2026 19:19:06 +0300 Subject: [PATCH 1/5] New (alternative) message entity parsers Added 3 test versions of message entity parsers (apply_html_entities processor). --- telebot/formatting.py | 565 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 560 insertions(+), 5 deletions(-) diff --git a/telebot/formatting.py b/telebot/formatting.py index 0e300f7c1..343c1be78 100644 --- a/telebot/formatting.py +++ b/telebot/formatting.py @@ -1,12 +1,18 @@ """ Markdown & HTML formatting functions. - -.. versionadded:: 4.5.1 """ import re import html -from typing import Optional, List, Dict +from typing import Optional, List, Dict, Tuple + + +# Alternative message entities parsers. Can be: +# "deepseek" - deepseek version +# "gemini" - gemini version +# "chatgpt" - chatgpt version +# other values - original version +ENTITY_PASER_MODE = None def format_text(*args, separator="\n"): @@ -392,6 +398,12 @@ def apply_html_entities(text: str, entities: Optional[List], custom_subs: Option ) >> "Test parse formatting, url and text_mention and mention @username" """ + if ENTITY_PASER_MODE == "deepseek": + return apply_html_entities_ds(text, entities, custom_subs) + elif ENTITY_PASER_MODE == "gemini": + return apply_html_entities_gm(text, entities, custom_subs) + elif ENTITY_PASER_MODE == "chatgpt": + return apply_html_entities_cg(text, entities, custom_subs) if not entities: return text.replace("&", "&").replace("<", "<").replace(">", ">") @@ -401,7 +413,6 @@ def apply_html_entities(text: str, entities: Optional[List], custom_subs: Option "italic": "{text}", "pre": "
{text}
", "code": "{text}", - # "url": "{text}", # @badiboy plain URLs have no text and do not need tags "text_link": "{text}", "strikethrough": "{text}", "underline": "{text}", @@ -409,7 +420,6 @@ def apply_html_entities(text: str, entities: Optional[List], custom_subs: Option "custom_emoji": "{text}", "blockquote": "
{text}
", "expandable_blockquote": "
{text}
", - } if custom_subs: @@ -477,3 +487,548 @@ def func(upd_text, subst_type=None, url=None, user=None, custom_emoji_id=None, l html_text += func(utf16_text[offset * 2:]) return html_text + + +#region DeepSeek vibecoding here +class EntityProcessor: + """ + Handles parsing of text with message entities to HTML. + """ + + # Entity type to HTML template mapping + ENTITY_TEMPLATES = { + "bold": "{text}", + "italic": "{text}", + "pre": "
{text}
", + "code": "{text}", + "text_link": "{text}", + "strikethrough": "{text}", + "underline": "{text}", + "spoiler": "{text}", + "custom_emoji": "{text}", + "blockquote": "
{text}
", + "expandable_blockquote": "
{text}
", + } + + def __init__(self, text: str, custom_subs: Optional[Dict[str, str]] = None): + self.text = text + self.utf16_mapping, self.char_to_units = self.utf16_code_units_to_indices(text) + self.total_utf16_units = len(self.utf16_mapping) + self.custom_subs = custom_subs + + def check_entity_exists(self, entity_type: str) -> bool: + """ + Check if an entity type has a defined HTML template, considering custom substitutions. + """ + return (entity_type in self.ENTITY_TEMPLATES) or (self.custom_subs and (entity_type in self.custom_subs)) + + def get_entity_template(self, entity_type: str, default: Optional[str] = None) -> Optional[str]: + """ + Get the HTML template for a given entity type, considering custom substitutions. + """ + if entity_type in self.ENTITY_TEMPLATES: + return self.ENTITY_TEMPLATES[entity_type] + elif self.custom_subs and (entity_type in self.custom_subs): + return self.custom_subs[entity_type] + else: + return default + + @staticmethod + def utf16_code_units_to_indices(text: str) -> Tuple[List[int], List[int]]: + """ + Convert UTF-16 code unit positions to Python string indices. + + Returns: + - code_unit_to_char_idx: Mapping from UTF-16 code unit position to character index + - char_idx_to_code_units: Number of UTF-16 code units per character + """ + code_unit_to_char_idx = [] + char_idx_to_code_units = [] + + code_unit_pos = 0 + for char_idx, char in enumerate(text): + code_point = ord(char) + # Characters outside BMP (U+10000 to U+10FFFF) use 2 UTF-16 code units + if code_point >= 0x10000: + code_units = 2 + else: + code_units = 1 + + # Map this code unit position to character index + for _ in range(code_units): + code_unit_to_char_idx.append(char_idx) + + char_idx_to_code_units.append(code_units) + code_unit_pos += code_units + + return code_unit_to_char_idx, char_idx_to_code_units + + def utf16_to_char_index(self, utf16_pos: int) -> int: + """ + Convert UTF-16 code unit position to character index. 
+ """ + if utf16_pos >= len(self.utf16_mapping): + return len(self.text) + return self.utf16_mapping[utf16_pos] + + def get_entity_text(self, entity) -> str: # entity: MessageEntity + """ + Extract the text for an entity using UTF-16 code unit offsets. + """ + start_char = self.utf16_to_char_index(entity.offset) + end_char = self.utf16_to_char_index(entity.offset + entity.length) + return self.text[start_char:end_char] + + def create_html_tag(self, entity, content: str) -> str: # entity: MessageEntity + """ + Create HTML tag for an entity with the given content. + """ + entity_type = entity.type + + # if entity_type in self.ENTITY_TEMPLATES: + # template = self.ENTITY_TEMPLATES[entity_type] + # elif self.custom_subs and (entity_type in self.custom_subs): + # template = self.custom_subs[entity_type] + # else: + # # If no template is defined for this entity type, return the content as is + # return content + template = self.get_entity_template(entity_type) + if not template: + return content + + # Prepare format arguments + format_args = {"text": content} + if entity_type == "text_link": + format_args["url"] = entity.url or "" + elif entity_type == "custom_emoji": + format_args["custom_emoji_id"] = entity.custom_emoji_id or "" + + return template.format(**format_args) + +def apply_html_entities_ds(text: str, entities: Optional[List], # entities: Optional[List[MessageEntity]] + custom_subs: Optional[Dict[str, str]] = None) -> str: + """ + Parse text message to HTML code according to message entities. + Properly handles UTF-16 code units for offsets and nested entities. 
+ + Args: + text: Plain text message + entities: List of MessageEntity objects + custom_subs: Optional custom substitutions (not used in this implementation) + + Returns: + HTML formatted string + """ + if not text: + return text + elif not entities: + return text.replace("&", "&").replace("<", "<").replace(">", ">") + + processor = EntityProcessor(text) + + # Sort entities by their position in the text + # For proper nesting handling, we need to process from the end + sorted_entities = sorted(entities, key=lambda e: e.offset, reverse=True) + + # Build a tree structure of entities + # First, convert UTF-16 offsets to character indices for easier processing + entity_ranges = [] + for entity in sorted_entities: + # if entity.type in processor.ENTITY_TEMPLATES: + # pass + # elif processor.custom_subs and (entity.type in processor.custom_subs): + # pass + # else: + # continue + if not processor.check_entity_exists(entity.type): + continue + + start_char = processor.utf16_to_char_index(entity.offset) + end_char = processor.utf16_to_char_index(entity.offset + entity.length) + + entity_ranges.append({ + 'entity': entity, + 'start': start_char, + 'end': end_char, + 'type': entity.type, + 'processed': False + }) + + # Sort by start position (ascending) and then by length (descending) + # This ensures parent entities come before children + entity_ranges.sort(key=lambda x: (x['start'], -x['end'])) + + # Build the HTML recursively + def process_range(start_idx: int, end_idx: int, entities_in_range: List[dict]) -> str: + """ + Recursively process a text range with its entities. 
+ """ + if not entities_in_range: + return text[start_idx:end_idx] + + # Group entities by their start position + result_parts = [] + current_pos = start_idx + + # Sort entities by their start position + entities_in_range.sort(key=lambda x: x['start']) + + i = 0 + while i < len(entities_in_range): + entity = entities_in_range[i] + + # Add text before this entity + if entity['start'] > current_pos: + result_parts.append(text[current_pos:entity['start']]) + + # Find all entities that start at the same position or are nested within + nested_entities = [] + j = i + while j < len(entities_in_range) and entities_in_range[j]['start'] < entity['end']: + if entities_in_range[j]['start'] >= entity['start']: + nested_entities.append(entities_in_range[j]) + j += 1 + + # Filter entities that are actually within this entity's range + nested_entities = [e for e in nested_entities if + e['start'] >= entity['start'] and e['end'] <= entity['end']] + + # Process the content of this entity (including nested entities) + content = process_range(entity['start'], entity['end'], + [e for e in nested_entities if e != entity]) + + # Apply this entity's HTML tag + html_content = processor.create_html_tag(entity['entity'], content) + result_parts.append(html_content) + + # Move current position to the end of this entity + current_pos = entity['end'] + i = j + + # Add remaining text + if current_pos < end_idx: + result_parts.append(text[current_pos:end_idx]) + + return ''.join(result_parts) + + # Process the entire text + return process_range(0, len(text), entity_ranges) +#endregion + +#region Gemini vibecoding here +# import sys +# import html +# from typing import List, Optional, Dict + +# 2. 
Main Function +def apply_html_entities_gm( + text: str, + entities: Optional[List], # entities: Optional[List[MessageEntity]] + custom_subs: Optional[Dict[str, str]] = None +) -> str: + # if not entities: + # return html.escape(text) + if not text: + return text + elif not entities: + return text.replace("&", "&").replace("<", "<").replace(">", ">") + + # --- Step 1: Map UTF-16 offsets to Python String Indices --- + # Telegram API uses UTF-16 code units for offsets/length. + # Python strings are indexed by Unicode code points. + # We need to map: utf16_offset -> python_string_index + + # Identify all 'significant' UTF-16 boundaries we care about (start and end of every entity) + boundaries = set() + for e in entities: + boundaries.add(e.offset) + boundaries.add(e.offset + e.length) + + # Sort them to iterate through the text linearly + sorted_boundaries = sorted(list(boundaries)) + boundary_map = {} # Maps utf16_offset -> python_index + + current_utf16_len = 0 + boundary_idx = 0 + + # Iterate over the string code point by code point + for py_index, char in enumerate(text): + # If we reached a boundary, record the mapping + while boundary_idx < len(sorted_boundaries) and current_utf16_len == sorted_boundaries[boundary_idx]: + boundary_map[sorted_boundaries[boundary_idx]] = py_index + boundary_idx += 1 + + if boundary_idx >= len(sorted_boundaries): + break + + # Advance UTF-16 counter + # BMP characters (<= 0xFFFF) take 1 unit. Non-BMP (surrogates) take 2 units. + if ord(char) > 0xFFFF: + current_utf16_len += 2 + else: + current_utf16_len += 1 + + # Handle boundaries that fall exactly at the end of the string + while boundary_idx < len(sorted_boundaries) and current_utf16_len == sorted_boundaries[boundary_idx]: + boundary_map[sorted_boundaries[boundary_idx]] = len(text) + boundary_idx += 1 + + # --- Step 2: Create Markers --- + # We transform entities into "Insert Start Tag" and "Insert End Tag" markers. 
+ markers = [] + + for e in entities: + if e.offset not in boundary_map or (e.offset + e.length) not in boundary_map: + continue # Skip invalid entities + + start_py = boundary_map[e.offset] + end_py = boundary_map[e.offset + e.length] + + # Structure: (Index, Type, Priority, Entity) + # Type: 1 = Start Tag, 0 = End Tag. + # Priority: Used to ensure correct nesting (Outer tags wrap Inner tags). + # - For Start Tags (1): Larger length = Higher priority (Process earlier). + # We use negative length so 'smaller' number comes first in ASC sort. + # - For End Tags (0): Smaller length = Higher priority (Process earlier). + + # Start Marker + markers.append((start_py, 1, -e.length, e)) + + # End Marker + markers.append((end_py, 0, e.length, e)) + + # --- Step 3: Sort Markers --- + # Primary Key: Index (asc) + # Secondary Key: Type (End tags (0) before Start tags (1) at same index) -> This fixes vs + # Tertiary Key: Priority (Length based nesting) + + # FIX: We use a lambda key to avoid comparing the 'e' (MessageEntity) object directly + markers.sort(key=lambda x: (x[0], x[1], x[2])) + + # --- Step 4: Build HTML --- + result = [] + text_ptr = 0 + stack = [] # To track currently open entities + + for index, tag_type, _, entity in markers: + # 1. Append text leading up to this marker + if index > text_ptr: + result.append(html.escape(text[text_ptr:index])) + text_ptr = index + + # 2. 
Get the HTML tag representation + tag = get_html_tag(entity, custom_subs) + if not tag: + continue + + if tag_type == 1: # START TAG + result.append(tag['open']) + stack.append(entity) + + else: # END TAG + # If stack is empty (shouldn't happen in valid data), ignore + if not stack: + continue + + # If the entity to close is at the top of the stack, close it normally + if stack[-1] == entity: + result.append(tag['close']) + stack.pop() + else: + # INTERSECTING ENTITIES DETECTED + # We need to close everything down to our entity, then reopen them + if entity in stack: + temp_stack = [] + + # Pop and close until we find the target + while stack[-1] != entity: + top_entity = stack.pop() + top_tag = get_html_tag(top_entity, custom_subs) + if top_tag: + result.append(top_tag['close']) + temp_stack.append(top_entity) + + # Close the target entity + result.append(tag['close']) + stack.pop() + + # Re-open the temporarily closed entities (in reverse order to preserve nesting) + for popped_entity in reversed(temp_stack): + p_tag = get_html_tag(popped_entity, custom_subs) + if p_tag: + result.append(p_tag['open']) + stack.append(popped_entity) + + # Append remaining text + if text_ptr < len(text): + result.append(html.escape(text[text_ptr:])) + + return "".join(result) + + +def get_html_tag(entity, custom_subs: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]: # entity: MessageEntity + """Helper to get open/close tags based on entity type.""" + + # Check custom subs first (basic implementation: if type in dict, return it as open tag) + # Note: The prompt implies full substitutions, but simple key-value usually implies 'open' tag or full format. + # Given the complexity of closing tags, we stick to the Prompt's Rules for known types. 
+ + t = entity.type + text_placeholder = "{text}" # Not used here directly, we just return tags + + if t == "bold": + return {'open': "", 'close': ""} + elif t == "italic": + return {'open': "", 'close': ""} + elif t == "underline": + return {'open': "", 'close': ""} + elif t == "strikethrough": + return {'open': "", 'close': ""} + elif t == "spoiler": + return {'open': '', 'close': ""} + elif t == "code": + return {'open': "", 'close': ""} + elif t == "pre": + return {'open': "
", 'close': "
"} + elif t == "blockquote": + return {'open': "
", 'close': "
"} + elif t == "expandable_blockquote": + return {'open': "
", 'close': "
"} + elif t == "text_link": + return {'open': f'', 'close': ""} + elif t == "custom_emoji": + return {'open': f'', 'close': ""} + elif t in custom_subs: + return None # Custom subs are not handled in this tag-based approach + + return None +#endregion + +#region ChatGPT vibecoding here +ENTITY_TEMPLATES_CG = { + "bold": "{text}", + "italic": "{text}", + "pre": "
{text}
", + "code": "{text}", + "text_link": "{text}", + "strikethrough": "{text}", + "underline": "{text}", + "spoiler": "{text}", + "custom_emoji": "{text}", + "blockquote": "
{text}
", + "expandable_blockquote": "
{text}
", +} + +def utf16_index_map(s: str) -> List[int]: + """ + Map UTF-16 code unit index -> Python string index. + Result length = utf16_len + 1 + """ + mapping = [0] + u16 = 0 + for i, ch in enumerate(s): + code = ord(ch) + u16 += 2 if code > 0xFFFF else 1 + while len(mapping) <= u16: + mapping.append(i + 1) + return mapping + +def apply_template(entity, inner: str, custom_subs: Optional[Dict[str, str]]) -> str: + t = entity.type + if t in ENTITY_TEMPLATES_CG: + tpl = ENTITY_TEMPLATES_CG[t] + elif custom_subs and t in custom_subs: + tpl = custom_subs[t] + else: + return inner + + data = {"text": inner} + + if t == "text_link": + data["url"] = getattr(entity, "url", "") + if t == "custom_emoji": + data["custom_emoji_id"] = getattr(entity, "custom_emoji_id", "") + + return tpl.format(**data) + +def build_tree(text: str, entities: List, mapping: List[int]): + nodes = [] + + for e in entities: + start16 = e.offset + end16 = e.offset + e.length + + start = mapping[start16] + end = mapping[end16] + + nodes.append({ + "entity": e, + "start": start, + "end": end, + "children": [] + }) + + nodes.sort(key=lambda n: (n["start"], -n["end"])) + + stack = [] + roots = [] + + for n in nodes: + while stack and n["start"] >= stack[-1]["end"]: + stack.pop() + + if stack: + stack[-1]["children"].append(n) + else: + roots.append(n) + + stack.append(n) + + return roots + +def render(text: str, nodes, custom_subs): + result = [] + pos = 0 + + for n in nodes: + result.append(text[pos:n["start"]]) + + inner = render( + text[n["start"]:n["end"]], + shift_nodes(n["children"], n["start"]), + custom_subs + ) + + wrapped = apply_template(n["entity"], inner, custom_subs) + result.append(wrapped) + + pos = n["end"] + + result.append(text[pos:]) + return "".join(result) + +def shift_nodes(nodes, shift): + out = [] + for n in nodes: + out.append({ + "entity": n["entity"], + "start": n["start"] - shift, + "end": n["end"] - shift, + "children": shift_nodes(n["children"], shift), + }) + return out + 
+def apply_html_entities_cg( + text: str, + entities: Optional[List], + custom_subs: Optional[Dict[str, str]] +) -> str: + if not text: + return text + elif not entities: + return text.replace("&", "&").replace("<", "<").replace(">", ">") + + mapping = utf16_index_map(text) + tree = build_tree(text, entities, mapping) + return render(text, tree, custom_subs) +#endregion From 9931d19dc0e454ea4d6217e49768683d852f10b1 Mon Sep 17 00:00:00 2001 From: Badiboy Date: Sat, 14 Feb 2026 19:39:21 +0300 Subject: [PATCH 2/5] Update telebot/formatting.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- telebot/formatting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/telebot/formatting.py b/telebot/formatting.py index 343c1be78..c2b140916 100644 --- a/telebot/formatting.py +++ b/telebot/formatting.py @@ -599,7 +599,7 @@ def create_html_tag(self, entity, content: str) -> str: # entity: MessageEntity # Prepare format arguments format_args = {"text": content} if entity_type == "text_link": - format_args["url"] = entity.url or "" + format_args["url"] = escape_html(entity.url or "") elif entity_type == "custom_emoji": format_args["custom_emoji_id"] = entity.custom_emoji_id or "" From c46d91cc3a42e37ed3389ff8bff8606f04c91f3f Mon Sep 17 00:00:00 2001 From: Badiboy Date: Sat, 14 Feb 2026 19:40:11 +0300 Subject: [PATCH 3/5] Update telebot/formatting.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- telebot/formatting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/telebot/formatting.py b/telebot/formatting.py index c2b140916..7e0564ba8 100644 --- a/telebot/formatting.py +++ b/telebot/formatting.py @@ -614,7 +614,7 @@ def apply_html_entities_ds(text: str, entities: Optional[List], # enti Args: text: Plain text message entities: List of MessageEntity objects - custom_subs: Optional custom substitutions (not used in this implementation) + custom_subs: Optional mapping of entity types to custom HTML 
substitutions/templates. Returns: HTML formatted string From e71061181a1f2eb792afed47c2f409fe3da33be8 Mon Sep 17 00:00:00 2001 From: Badiboy Date: Sat, 14 Feb 2026 21:05:52 +0300 Subject: [PATCH 4/5] Fix issues, apply CoPilot hints --- telebot/formatting.py | 113 ++++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 55 deletions(-) diff --git a/telebot/formatting.py b/telebot/formatting.py index 343c1be78..1b18ad97b 100644 --- a/telebot/formatting.py +++ b/telebot/formatting.py @@ -4,7 +4,7 @@ import re import html -from typing import Optional, List, Dict, Tuple +from typing import Optional, List, Dict # Alternative message entities parsers. Can be: @@ -12,7 +12,7 @@ # "gemini" - gemini version # "chatgpt" - chatgpt version # other values - original version -ENTITY_PASER_MODE = None +ENTITY_PARSER_MODE = None def format_text(*args, separator="\n"): @@ -51,6 +51,7 @@ def escape_html(content: str) -> str: return html.escape(content) +# noinspection RegExpRedundantEscape def escape_markdown(content: str) -> str: """ Escapes Markdown characters in a string of Markdown. @@ -275,6 +276,9 @@ def mcode(content: str, language: str="", escape: Optional[bool]=True) -> str: :param content: The string to code. :type content: :obj:`str` + :param language: The programming language of the code. Defaults to an empty string. + :type language: :obj:`str` + :param escape: True if you need to escape special characters. Defaults to True. :type escape: :obj:`bool` @@ -310,6 +314,9 @@ def hpre(content: str, escape: Optional[bool]=True, language: str="") -> str: :param escape: True if you need to escape special characters. Defaults to True. :type escape: :obj:`bool` + :param language: The programming language of the code. Defaults to an empty string. + :type language: :obj:`str` + :return: The formatted string. 
:rtype: :obj:`str` """ @@ -398,11 +405,11 @@ def apply_html_entities(text: str, entities: Optional[List], custom_subs: Option ) >> "Test parse formatting, url and text_mention and mention @username" """ - if ENTITY_PASER_MODE == "deepseek": + if ENTITY_PARSER_MODE == "deepseek": return apply_html_entities_ds(text, entities, custom_subs) - elif ENTITY_PASER_MODE == "gemini": + elif ENTITY_PARSER_MODE == "gemini": return apply_html_entities_gm(text, entities, custom_subs) - elif ENTITY_PASER_MODE == "chatgpt": + elif ENTITY_PARSER_MODE == "chatgpt": return apply_html_entities_cg(text, entities, custom_subs) if not entities: @@ -433,8 +440,8 @@ def func(upd_text, subst_type=None, url=None, user=None, custom_emoji_id=None, l if subst_type == "text_mention": subst_type = "text_link" url = "tg://user?id={0}".format(user.id) - elif subst_type == "mention": - url = "https://t.me/{0}".format(upd_text[1:]) + # elif subst_type == "mention": + # url = "https://t.me/{0}".format(upd_text[1:]) upd_text = upd_text.replace("&", "&").replace("<", "<").replace(">", ">") if not subst_type or not _subs.get(subst_type): return upd_text @@ -512,7 +519,7 @@ class EntityProcessor: def __init__(self, text: str, custom_subs: Optional[Dict[str, str]] = None): self.text = text - self.utf16_mapping, self.char_to_units = self.utf16_code_units_to_indices(text) + self.utf16_mapping = self.utf16_code_units_to_indices(text) self.total_utf16_units = len(self.utf16_mapping) self.custom_subs = custom_subs @@ -534,16 +541,14 @@ def get_entity_template(self, entity_type: str, default: Optional[str] = None) - return default @staticmethod - def utf16_code_units_to_indices(text: str) -> Tuple[List[int], List[int]]: + def utf16_code_units_to_indices(text: str) -> List[int]: """ Convert UTF-16 code unit positions to Python string indices. 
Returns: - - code_unit_to_char_idx: Mapping from UTF-16 code unit position to character index - - char_idx_to_code_units: Number of UTF-16 code units per character + code_unit_to_char_idx: Mapping from UTF-16 code unit position to character index """ code_unit_to_char_idx = [] - char_idx_to_code_units = [] code_unit_pos = 0 for char_idx, char in enumerate(text): @@ -558,10 +563,9 @@ def utf16_code_units_to_indices(text: str) -> Tuple[List[int], List[int]]: for _ in range(code_units): code_unit_to_char_idx.append(char_idx) - char_idx_to_code_units.append(code_units) code_unit_pos += code_units - return code_unit_to_char_idx, char_idx_to_code_units + return code_unit_to_char_idx def utf16_to_char_index(self, utf16_pos: int) -> int: """ @@ -585,23 +589,21 @@ def create_html_tag(self, entity, content: str) -> str: # entity: MessageEntity """ entity_type = entity.type - # if entity_type in self.ENTITY_TEMPLATES: - # template = self.ENTITY_TEMPLATES[entity_type] - # elif self.custom_subs and (entity_type in self.custom_subs): - # template = self.custom_subs[entity_type] - # else: - # # If no template is defined for this entity type, return the content as is - # return content template = self.get_entity_template(entity_type) if not template: return content # Prepare format arguments format_args = {"text": content} - if entity_type == "text_link": - format_args["url"] = entity.url or "" + if entity_type == "text_mention": + template = self.get_entity_template("text_link") + format_args["url"] = "tg://user?id={0}".format(entity.user.id) + elif entity_type == "text_link": + format_args["url"] = escape_html(entity.url or "") elif entity_type == "custom_emoji": format_args["custom_emoji_id"] = entity.custom_emoji_id or "" + elif entity_type == "pre" and entity.language: + format_args["text"] = '{}'.format(entity.language, format_args["text"]) return template.format(**format_args) @@ -614,7 +616,7 @@ def apply_html_entities_ds(text: str, entities: Optional[List], # enti Args: 
text: Plain text message entities: List of MessageEntity objects - custom_subs: Optional custom substitutions (not used in this implementation) + custom_subs: Optional mapping of entity types to custom HTML substitutions/templates. Returns: HTML formatted string @@ -624,7 +626,7 @@ def apply_html_entities_ds(text: str, entities: Optional[List], # enti elif not entities: return text.replace("&", "&").replace("<", "<").replace(">", ">") - processor = EntityProcessor(text) + processor = EntityProcessor(text, custom_subs=custom_subs) # Sort entities by their position in the text # For proper nesting handling, we need to process from the end @@ -634,12 +636,6 @@ def apply_html_entities_ds(text: str, entities: Optional[List], # enti # First, convert UTF-16 offsets to character indices for easier processing entity_ranges = [] for entity in sorted_entities: - # if entity.type in processor.ENTITY_TEMPLATES: - # pass - # elif processor.custom_subs and (entity.type in processor.custom_subs): - # pass - # else: - # continue if not processor.check_entity_exists(entity.type): continue @@ -651,7 +647,6 @@ def apply_html_entities_ds(text: str, entities: Optional[List], # enti 'start': start_char, 'end': end_char, 'type': entity.type, - 'processed': False }) # Sort by start position (ascending) and then by length (descending) @@ -675,34 +670,34 @@ def process_range(start_idx: int, end_idx: int, entities_in_range: List[dict]) - i = 0 while i < len(entities_in_range): - entity = entities_in_range[i] + cur_entity = entities_in_range[i] # Add text before this entity - if entity['start'] > current_pos: - result_parts.append(text[current_pos:entity['start']]) + if cur_entity['start'] > current_pos: + result_parts.append(text[current_pos:cur_entity['start']]) # Find all entities that start at the same position or are nested within nested_entities = [] j = i - while j < len(entities_in_range) and entities_in_range[j]['start'] < entity['end']: - if entities_in_range[j]['start'] >= 
entity['start']: + while j < len(entities_in_range) and entities_in_range[j]['start'] < cur_entity['end']: + if entities_in_range[j]['start'] >= cur_entity['start']: nested_entities.append(entities_in_range[j]) j += 1 # Filter entities that are actually within this entity's range nested_entities = [e for e in nested_entities if - e['start'] >= entity['start'] and e['end'] <= entity['end']] + e['start'] >= cur_entity['start'] and e['end'] <= cur_entity['end']] # Process the content of this entity (including nested entities) - content = process_range(entity['start'], entity['end'], - [e for e in nested_entities if e != entity]) + content = process_range(cur_entity['start'], cur_entity['end'], + [e for e in nested_entities if e != cur_entity]) # Apply this entity's HTML tag - html_content = processor.create_html_tag(entity['entity'], content) + html_content = processor.create_html_tag(cur_entity['entity'], content) result_parts.append(html_content) # Move current position to the end of this entity - current_pos = entity['end'] + current_pos = cur_entity['end'] i = j # Add remaining text @@ -716,11 +711,6 @@ def process_range(start_idx: int, end_idx: int, entities_in_range: List[dict]) - #endregion #region Gemini vibecoding here -# import sys -# import html -# from typing import List, Optional, Dict - -# 2. Main Function def apply_html_entities_gm( text: str, entities: Optional[List], # entities: Optional[List[MessageEntity]] @@ -874,8 +864,6 @@ def get_html_tag(entity, custom_subs: Optional[Dict[str, str]]) -> Optional[Dict # Given the complexity of closing tags, we stick to the Prompt's Rules for known types. 
t = entity.type - text_placeholder = "{text}" # Not used here directly, we just return tags - if t == "bold": return {'open': "", 'close': ""} elif t == "italic": @@ -888,6 +876,8 @@ def get_html_tag(entity, custom_subs: Optional[Dict[str, str]]) -> Optional[Dict return {'open': '', 'close': ""} elif t == "code": return {'open': "", 'close': ""} + elif (t == "pre") and entity.language: + return {'open': f'
', 'close': "
"} elif t == "pre": return {'open': "
", 'close': "
"} elif t == "blockquote": @@ -896,10 +886,19 @@ def get_html_tag(entity, custom_subs: Optional[Dict[str, str]]) -> Optional[Dict return {'open': "
", 'close': "
"} elif t == "text_link": return {'open': f'', 'close': ""} + elif t == "text_mention": + return {'open': f'', 'close': ""} elif t == "custom_emoji": return {'open': f'', 'close': ""} - elif t in custom_subs: - return None # Custom subs are not handled in this tag-based approach + elif custom_subs and (t in custom_subs): + # Support custom substitutions by splitting the template around the {text} placeholder + template = custom_subs[t] + if "{text}" in template: + open_part, close_part = template.split("{text}", 1) + else: + # If no {text} placeholder is present, treat the entire template as the opening part + open_part, close_part = template, "" + return {'open': open_part, 'close': close_part} return None #endregion @@ -946,12 +945,16 @@ def apply_template(entity, inner: str, custom_subs: Optional[Dict[str, str]]) -> if t == "text_link": data["url"] = getattr(entity, "url", "") - if t == "custom_emoji": + elif t == "text_mention": + data["url"] = f"tg://user?id={getattr(entity, 'user', {}).id if getattr(entity, 'user', None) else ''}" + elif t == "custom_emoji": data["custom_emoji_id"] = getattr(entity, "custom_emoji_id", "") + elif (t == "pre") and getattr(entity, "language", None): + data["text"] = f'{inner}' return tpl.format(**data) -def build_tree(text: str, entities: List, mapping: List[int]): +def build_tree(entities: List, mapping: List[int]): nodes = [] for e in entities: @@ -968,7 +971,7 @@ def build_tree(text: str, entities: List, mapping: List[int]): "children": [] }) - nodes.sort(key=lambda n: (n["start"], -n["end"])) + nodes.sort(key=lambda node: (node["start"], -node["end"])) stack = [] roots = [] @@ -1029,6 +1032,6 @@ def apply_html_entities_cg( return text.replace("&", "&").replace("<", "<").replace(">", ">") mapping = utf16_index_map(text) - tree = build_tree(text, entities, mapping) + tree = build_tree(entities, mapping) return render(text, tree, custom_subs) #endregion From ab8d1ad9818cd73e3eb25d4e5691741b864f03dc Mon Sep 17 00:00:00 2001 
def apply_html_entities_coder(text: str, entities=None, custom_subs=None) -> str:
    """
    Apply HTML formatting to text based on provided entities.

    Hand-written alternative entity parser. It is the implementation that
    ``apply_html_entities`` dispatches to when ``ENTITY_PARSER_MODE == "coder"``.
    Nested entities are rendered recursively; entities that follow each other
    are rendered in a loop.

    :param text: Plain message text.
    :param entities: Iterable of entity objects exposing ``type``, ``offset`` and
        ``length`` (offset/length are applied to the UTF-16 encoding of ``text``),
        plus the optional attributes ``url``, ``user``, ``custom_emoji_id`` and
        ``language``. ``None`` or an empty list means "no formatting".
    :param custom_subs: Optional mapping ``entity type -> template``. A template
        may use the ``{text}``, ``{url}`` and ``{custom_emoji_id}`` placeholders.
        Entries add to (or replace) the default templates.
    :return: ``text`` with HTML special characters escaped and entities wrapped
        in their HTML tags.
    """

    def escape_html(text_part):
        """Escape HTML special characters in a str or UTF-16-LE bytes fragment."""
        if isinstance(text_part, bytes):
            text_part = text_part.decode("utf-16-le")
        return text_part.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

    def escape_attr(value):
        """Escape a value that is interpolated inside a double-quoted attribute."""
        # A raw '"' coming from an entity (e.g. in a link URL) would otherwise
        # terminate the attribute and corrupt the generated markup.
        return escape_html(str(value)).replace('"', "&quot;")

    if not entities:
        return escape_html(text)

    _subs = {
        "bold": "<b>{text}</b>",
        "italic": "<i>{text}</i>",
        "pre": "<pre>{text}</pre>",
        "code": "<code>{text}</code>",
        "text_link": "<a href=\"{url}\">{text}</a>",
        "strikethrough": "<s>{text}</s>",
        "underline": "<u>{text}</u>",
        "spoiler": "<span class=\"tg-spoiler\">{text}</span>",
        "custom_emoji": "<tg-emoji emoji-id=\"{custom_emoji_id}\">{text}</tg-emoji>",
        "blockquote": "<blockquote>{text}</blockquote>",
        "expandable_blockquote": "<blockquote expandable>{text}</blockquote>",
    }
    if custom_subs:
        _subs.update(custom_subs)

    # Outer entities first: order by start offset, longer entity first on ties.
    sorted_entities = sorted(entities, key=lambda e: (e.offset, -e.length))

    # Offsets and lengths count UTF-16 code units, i.e. 2 bytes each in UTF-16-LE.
    utf16_text = text.encode("utf-16-le")

    def format_entity(entity, content):
        """Wrap already-escaped ``content`` in the markup for ``entity``."""
        entity_type = entity.type
        # Use getattr + value checks instead of hasattr: the attribute may exist
        # but be None, which must not be rendered (or dereferenced).
        user = getattr(entity, "user", None)
        url = getattr(entity, "url", None)
        custom_emoji_id = getattr(entity, "custom_emoji_id", None)
        language = getattr(entity, "language", None)

        if entity_type == "text_mention" and user is not None:
            user_id = escape_attr(user.id)
            return f"<a href=\"tg://user?id={user_id}\">{content}</a>"
        # A plain "mention" needs no markup: "@username" works as is.
        if entity_type == "text_link" and url:
            href = escape_attr(url)
            return f"<a href=\"{href}\">{content}</a>"
        if entity_type == "custom_emoji" and custom_emoji_id:
            emoji_id = escape_attr(custom_emoji_id)
            return f"<tg-emoji emoji-id=\"{emoji_id}\">{content}</tg-emoji>"
        if entity_type == "pre" and language:
            lang = escape_attr(language)
            return f"<pre><code class=\"language-{lang}\">{content}</code></pre>"

        template = _subs.get(entity_type)
        if not template:
            # No substitution known for this entity type: keep the text as is.
            return content
        # Provide every placeholder the templates may reference, so a template
        # using {url} / {custom_emoji_id} does not raise KeyError.
        return template.format(
            text=content,
            url=escape_attr(url or ""),
            custom_emoji_id=escape_attr(custom_emoji_id or ""),
        )

    def process_entities(entity_list, start_pos, end_pos):
        """Render the byte range [start_pos, end_pos) using ``entity_list``."""
        parts = []
        cursor = start_pos
        index = 0
        count = len(entity_list)

        while index < count:
            entity = entity_list[index]
            index += 1

            # Clamp to the range still to be rendered, so overlapping or
            # out-of-bounds entities can never duplicate text.
            entity_start = max(entity.offset * 2, cursor)
            entity_end = min((entity.offset + entity.length) * 2, end_pos)
            if entity_end <= entity_start:
                # Empty or out-of-range entity: skip it, keep the others.
                continue

            if entity_start > cursor:
                parts.append(escape_html(utf16_text[cursor:entity_start]))

            # The list is sorted by offset, so entities nested in the current
            # one directly follow it.
            nested_entities = []
            while index < count and entity_list[index].offset * 2 < entity_end:
                nested_entities.append(entity_list[index])
                index += 1

            if nested_entities:
                inner_content = process_entities(nested_entities, entity_start, entity_end)
            else:
                inner_content = escape_html(utf16_text[entity_start:entity_end])

            parts.append(format_entity(entity, inner_content))
            cursor = entity_end

        if cursor < end_pos:
            parts.append(escape_html(utf16_text[cursor:end_pos]))

        return "".join(parts)

    return process_entities(sorted_entities, 0, len(utf16_text))