diff --git a/telebot/formatting.py b/telebot/formatting.py index 0e300f7c1..35f624e54 100644 --- a/telebot/formatting.py +++ b/telebot/formatting.py @@ -1,7 +1,5 @@ """ Markdown & HTML formatting functions. - -.. versionadded:: 4.5.1 """ import re @@ -9,6 +7,15 @@ from typing import Optional, List, Dict +# Alternative message entities parsers. Can be: +# "deepseek" - deepseek version +# "gemini" - gemini version +# "chatgpt" - chatgpt version +# "coder" - @coder2020official version +# other values - original version +ENTITY_PARSER_MODE = None + + def format_text(*args, separator="\n"): """ Formats a list of strings into a single string. @@ -45,6 +52,7 @@ def escape_html(content: str) -> str: return html.escape(content) +# noinspection RegExpRedundantEscape def escape_markdown(content: str) -> str: """ Escapes Markdown characters in a string of Markdown. @@ -269,6 +277,9 @@ def mcode(content: str, language: str="", escape: Optional[bool]=True) -> str: :param content: The string to code. :type content: :obj:`str` + :param language: The programming language of the code. Defaults to an empty string. + :type language: :obj:`str` + :param escape: True if you need to escape special characters. Defaults to True. :type escape: :obj:`bool` @@ -304,6 +315,9 @@ def hpre(content: str, escape: Optional[bool]=True, language: str="") -> str: :param escape: True if you need to escape special characters. Defaults to True. :type escape: :obj:`bool` + :param language: The programming language of the code. Defaults to an empty string. + :type language: :obj:`str` + :return: The formatted string. 
:rtype: :obj:`str` """ @@ -392,6 +406,14 @@ def apply_html_entities(text: str, entities: Optional[List], custom_subs: Option ) >> "Test parse formatting, url and text_mention and mention @username" """ + if ENTITY_PARSER_MODE == "deepseek": + return apply_html_entities_ds(text, entities, custom_subs) + elif ENTITY_PARSER_MODE == "gemini": + return apply_html_entities_gm(text, entities, custom_subs) + elif ENTITY_PARSER_MODE == "chatgpt": + return apply_html_entities_cg(text, entities, custom_subs) + elif ENTITY_PARSER_MODE == "coder": + return apply_html_entities_coder(text, entities, custom_subs) if not entities: return text.replace("&", "&").replace("<", "<").replace(">", ">") @@ -401,7 +423,6 @@ def apply_html_entities(text: str, entities: Optional[List], custom_subs: Option "italic": "{text}", "pre": "
{text}",
"code": "{text}",
- # "url": "{text}", # @badiboy plain URLs have no text and do not need tags
"text_link": "{text}",
"strikethrough": "{text}", "expandable_blockquote": "
{text}", - } if custom_subs: @@ -423,8 +443,8 @@ def func(upd_text, subst_type=None, url=None, user=None, custom_emoji_id=None, l if subst_type == "text_mention": subst_type = "text_link" url = "tg://user?id={0}".format(user.id) - elif subst_type == "mention": - url = "https://t.me/{0}".format(upd_text[1:]) + # elif subst_type == "mention": + # url = "https://t.me/{0}".format(upd_text[1:]) upd_text = upd_text.replace("&", "&").replace("<", "<").replace(">", ">") if not subst_type or not _subs.get(subst_type): return upd_text @@ -477,3 +497,664 @@ def func(upd_text, subst_type=None, url=None, user=None, custom_emoji_id=None, l html_text += func(utf16_text[offset * 2:]) return html_text + + +#region DeepSeek vibecoding here +class EntityProcessor: + """ + Handles parsing of text with message entities to HTML. + """ + + # Entity type to HTML template mapping + ENTITY_TEMPLATES = { + "bold": "{text}", + "italic": "{text}", + "pre": "
{text}",
+ "code": "{text}",
+ "text_link": "{text}",
+ "strikethrough": "{text}", + "expandable_blockquote": "
{text}", + } + + def __init__(self, text: str, custom_subs: Optional[Dict[str, str]] = None): + self.text = text + self.utf16_mapping = self.utf16_code_units_to_indices(text) + self.total_utf16_units = len(self.utf16_mapping) + self.custom_subs = custom_subs + + def check_entity_exists(self, entity_type: str) -> bool: + """ + Check if an entity type has a defined HTML template, considering custom substitutions. + """ + return (entity_type in self.ENTITY_TEMPLATES) or (self.custom_subs and (entity_type in self.custom_subs)) + + def get_entity_template(self, entity_type: str, default: Optional[str] = None) -> Optional[str]: + """ + Get the HTML template for a given entity type, considering custom substitutions. + """ + if entity_type in self.ENTITY_TEMPLATES: + return self.ENTITY_TEMPLATES[entity_type] + elif self.custom_subs and (entity_type in self.custom_subs): + return self.custom_subs[entity_type] + else: + return default + + @staticmethod + def utf16_code_units_to_indices(text: str) -> List[int]: + """ + Convert UTF-16 code unit positions to Python string indices. + + Returns: + code_unit_to_char_idx: Mapping from UTF-16 code unit position to character index + """ + code_unit_to_char_idx = [] + + code_unit_pos = 0 + for char_idx, char in enumerate(text): + code_point = ord(char) + # Characters outside BMP (U+10000 to U+10FFFF) use 2 UTF-16 code units + if code_point >= 0x10000: + code_units = 2 + else: + code_units = 1 + + # Map this code unit position to character index + for _ in range(code_units): + code_unit_to_char_idx.append(char_idx) + + code_unit_pos += code_units + + return code_unit_to_char_idx + + def utf16_to_char_index(self, utf16_pos: int) -> int: + """ + Convert UTF-16 code unit position to character index. 
+ """ + if utf16_pos >= len(self.utf16_mapping): + return len(self.text) + return self.utf16_mapping[utf16_pos] + + def get_entity_text(self, entity) -> str: # entity: MessageEntity + """ + Extract the text for an entity using UTF-16 code unit offsets. + """ + start_char = self.utf16_to_char_index(entity.offset) + end_char = self.utf16_to_char_index(entity.offset + entity.length) + return self.text[start_char:end_char] + + def create_html_tag(self, entity, content: str) -> str: # entity: MessageEntity + """ + Create HTML tag for an entity with the given content. + """ + entity_type = entity.type + + template = self.get_entity_template(entity_type) + if not template: + return content + + # Prepare format arguments + format_args = {"text": content} + if entity_type == "text_mention": + template = self.get_entity_template("text_link") + format_args["url"] = "tg://user?id={0}".format(entity.user.id) + elif entity_type == "text_link": + format_args["url"] = escape_html(entity.url or "") + elif entity_type == "custom_emoji": + format_args["custom_emoji_id"] = entity.custom_emoji_id or "" + elif entity_type == "pre" and entity.language: + format_args["text"] = '
{}'.format(entity.language, format_args["text"])
+
+ return template.format(**format_args)
+
+def apply_html_entities_ds(text: str, entities: Optional[List],  # entities: Optional[List[MessageEntity]]
+                           custom_subs: Optional[Dict[str, str]] = None) -> str:
+    """
+    Parse text message to HTML code according to message entities.
+    Properly handles UTF-16 code units for offsets and nested entities.
+
+    Plain text, both inside and outside entities, is HTML-escaped
+    (``&``, ``<`` and ``>``). Entities whose type has no template, or whose
+    range is empty once converted to character indices, are ignored. An entity
+    that only partially overlaps an earlier one is not rendered as a tag; its
+    text is still emitted.
+
+    Args:
+        text: Plain text message
+        entities: List of MessageEntity objects
+        custom_subs: Optional mapping of entity types to custom HTML substitutions/templates.
+
+    Returns:
+        HTML formatted string
+    """
+    def escape(fragment: str) -> str:
+        # "&" must be replaced first, otherwise the "&" of "&lt;"/"&gt;" would be escaped again.
+        return fragment.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
+
+    if not text:
+        return text
+    if not entities:
+        return escape(text)
+
+    processor = EntityProcessor(text, custom_subs=custom_subs)
+
+    # Convert UTF-16 offsets to character indices once, dropping entities that cannot be rendered.
+    entity_ranges = []
+    for entity in entities:
+        if not processor.check_entity_exists(entity.type):
+            continue
+
+        start_char = processor.utf16_to_char_index(entity.offset)
+        end_char = processor.utf16_to_char_index(entity.offset + entity.length)
+        if start_char >= end_char:
+            # An empty range (zero length, or an offset past the end of the text)
+            # would never advance the scan in process_range.
+            continue
+
+        entity_ranges.append({
+            'entity': entity,
+            'start': start_char,
+            'end': end_char,
+            'type': entity.type,
+        })
+
+    # Sort by start position (ascending) and then by end (descending),
+    # so that parent entities come before their children.
+    entity_ranges.sort(key=lambda x: (x['start'], -x['end']))
+
+    def process_range(start_idx: int, end_idx: int, entities_in_range: List[dict]) -> str:
+        """
+        Recursively render text[start_idx:end_idx] with the given entities.
+
+        entities_in_range is always an order-preserving sub-list of
+        entity_ranges, so it is already sorted by start position.
+        """
+        if not entities_in_range:
+            return escape(text[start_idx:end_idx])
+
+        result_parts = []
+        current_pos = start_idx
+
+        i = 0
+        while i < len(entities_in_range):
+            cur_entity = entities_in_range[i]
+
+            # Add text before this entity
+            if cur_entity['start'] > current_pos:
+                result_parts.append(escape(text[current_pos:cur_entity['start']]))
+
+            # Collect the following entities that start inside this one.
+            # Only those that also end inside it are nested; the others overlap
+            # partially and are skipped.
+            nested_entities = []
+            j = i + 1
+            while j < len(entities_in_range) and entities_in_range[j]['start'] < cur_entity['end']:
+                if entities_in_range[j]['end'] <= cur_entity['end']:
+                    nested_entities.append(entities_in_range[j])
+                j += 1
+
+            # Render the content of this entity (including nested entities), then wrap it.
+            content = process_range(cur_entity['start'], cur_entity['end'], nested_entities)
+            result_parts.append(processor.create_html_tag(cur_entity['entity'], content))
+
+            # Continue after this entity; j > i always holds, so the loop terminates.
+            current_pos = cur_entity['end']
+            i = j
+
+        # Add remaining text
+        if current_pos < end_idx:
+            result_parts.append(escape(text[current_pos:end_idx]))
+
+        return ''.join(result_parts)
+
+    # Process the entire text
+    return process_range(0, len(text), entity_ranges)
+#endregion
+
+#region Gemini vibecoding here
+def apply_html_entities_gm(
+        text: str,
+        entities: Optional[List],  # entities: Optional[List[MessageEntity]]
+        custom_subs: Optional[Dict[str, str]] = None
+) -> str:
+    """
+    Convert a plain-text message and its entities to HTML using open/close markers.
+
+    Every renderable entity becomes a start marker and an end marker; the
+    markers are sorted and replayed over the text while a stack tracks the
+    open tags. When an entity must be closed while other tags opened after it
+    are still open (intersecting entities), those tags are closed, the entity
+    is closed, and the tags are re-opened.
+
+    Text between markers is HTML-escaped (``&``, ``<``, ``>``; quotes are left
+    as is, the same as on the no-entities path).
+
+    Skipped entities: those for which ``get_html_tag`` returns nothing, those
+    whose offsets do not fall on a character boundary inside the text, and
+    those whose range is empty.
+
+    :param text: Plain text message.
+    :type text: :obj:`str`
+
+    :param entities: Message entities; ``offset`` and ``length`` are read as UTF-16 code units.
+    :type entities: :obj:`list`
+
+    :param custom_subs: Passed through to ``get_html_tag``.
+    :type custom_subs: :obj:`dict`
+
+    :return: HTML formatted string.
+    :rtype: :obj:`str`
+    """
+    if not text:
+        return text
+    if not entities:
+        return html.escape(text, quote=False)
+
+    # --- Step 1: Map UTF-16 offsets to Python string indices ---
+    # Entity offsets/lengths are counted in UTF-16 code units, while Python
+    # strings are indexed by code point. Only entity boundaries need mapping.
+    boundaries = set()
+    for e in entities:
+        boundaries.add(e.offset)
+        boundaries.add(e.offset + e.length)
+
+    sorted_boundaries = sorted(boundaries)
+    boundary_map = {}  # Maps utf16_offset -> python_index
+    boundary_idx = 0
+    current_utf16_len = 0
+
+    def record_boundaries(py_index):
+        """Consume every boundary at or before the current UTF-16 position."""
+        nonlocal boundary_idx
+        while (boundary_idx < len(sorted_boundaries)
+               and sorted_boundaries[boundary_idx] <= current_utf16_len):
+            if sorted_boundaries[boundary_idx] == current_utf16_len:
+                boundary_map[current_utf16_len] = py_index
+            # A boundary strictly behind the current position (negative, or in
+            # the middle of a surrogate pair) can never be mapped. It is skipped
+            # so that it does not block the boundaries that follow it.
+            boundary_idx += 1
+
+    for py_index, char in enumerate(text):
+        record_boundaries(py_index)
+        if boundary_idx >= len(sorted_boundaries):
+            break
+        # BMP characters (<= 0xFFFF) take 1 unit, non-BMP characters take 2 units.
+        current_utf16_len += 2 if ord(char) > 0xFFFF else 1
+
+    # Boundaries that fall exactly at the end of the string.
+    record_boundaries(len(text))
+
+    # --- Step 2: Create markers ---
+    # Marker: (index, kind, priority, tie_break, entity, tag)
+    #   kind: 1 = start tag, 0 = end tag (end tags sort first at the same index).
+    #   priority: start tags use -length (longer entities open first),
+    #             end tags use +length (shorter entities close first).
+    #   tie_break: entities with identical ranges open in list order and close
+    #              in reverse list order, so they nest instead of intersecting.
+    markers = []
+    for seq, e in enumerate(entities):
+        start_py = boundary_map.get(e.offset)
+        end_py = boundary_map.get(e.offset + e.length)
+        if start_py is None or end_py is None or start_py >= end_py:
+            # Unmappable or empty entity. An empty one would otherwise be
+            # "closed" before it is opened and stay open forever.
+            continue
+
+        tag = get_html_tag(e, custom_subs)
+        if not tag:
+            continue
+
+        markers.append((start_py, 1, -e.length, seq, e, tag))
+        markers.append((end_py, 0, e.length, -seq, e, tag))
+
+    # --- Step 3: Sort markers ---
+    # Only the first four fields are compared; entity objects are never compared.
+    markers.sort(key=lambda m: m[:4])
+
+    # --- Step 4: Build HTML ---
+    result = []
+    text_ptr = 0
+    stack = []  # Currently open (entity, tag) pairs, outermost first
+
+    for index, kind, _, _, entity, tag in markers:
+        # Append the text leading up to this marker
+        if index > text_ptr:
+            result.append(html.escape(text[text_ptr:index], quote=False))
+            text_ptr = index
+
+        if kind == 1:  # START TAG
+            result.append(tag['open'])
+            stack.append((entity, tag))
+            continue
+
+        # END TAG: locate the entity on the stack (by identity, topmost first)
+        depth = None
+        for pos in range(len(stack) - 1, -1, -1):
+            if stack[pos][0] is entity:
+                depth = pos
+                break
+        if depth is None:
+            continue
+
+        # Tags opened after the entity are closed (innermost first) ...
+        still_open = stack[depth + 1:]
+        for _, open_tag in reversed(still_open):
+            result.append(open_tag['close'])
+
+        # ... then the entity itself is closed ...
+        result.append(tag['close'])
+        del stack[depth:]
+
+        # ... and the temporarily closed tags are re-opened in their original order.
+        for item in still_open:
+            result.append(item[1]['open'])
+            stack.append(item)
+
+    # Append remaining text
+    if text_ptr < len(text):
+        result.append(html.escape(text[text_ptr:], quote=False))
+
+    return "".join(result)
+
+
+def get_html_tag(entity, custom_subs: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]: # entity: MessageEntity
+ """Helper to get open/close tags based on entity type."""
+
+ # Check custom subs first (basic implementation: if type in dict, return it as open tag)
+ # Note: The prompt implies full substitutions, but simple key-value usually implies 'open' tag or full format.
+ # Given the complexity of closing tags, we stick to the Prompt's Rules for known types.
+
+ t = entity.type
+ if t == "bold":
+ return {'open': "", 'close': ""}
+ elif t == "italic":
+ return {'open': "", 'close': ""}
+ elif t == "underline":
+ return {'open': "", 'close': ""}
+ elif t == "strikethrough":
+ return {'open': "", 'close': ""}
+ elif (t == "pre") and entity.language:
+ return {'open': f'', 'close': ""}
+ elif t == "pre":
+ return {'open': "", 'close': ""} + elif t == "blockquote": + return {'open': "
", 'close': ""} + elif t == "expandable_blockquote": + return {'open': "
", 'close': ""} + elif t == "text_link": + return {'open': f'', 'close': ""} + elif t == "text_mention": + return {'open': f'', 'close': ""} + elif t == "custom_emoji": + return {'open': f'
{text}",
+ "code": "{text}",
+ "text_link": "{text}",
+ "strikethrough": "{text}", + "expandable_blockquote": "
{text}", +} + +def utf16_index_map(s: str) -> List[int]: + """ + Map UTF-16 code unit index -> Python string index. + Result length = utf16_len + 1 + """ + mapping = [0] + u16 = 0 + for i, ch in enumerate(s): + code = ord(ch) + u16 += 2 if code > 0xFFFF else 1 + while len(mapping) <= u16: + mapping.append(i + 1) + return mapping + +def apply_template(entity, inner: str, custom_subs: Optional[Dict[str, str]]) -> str: + t = entity.type + if t in ENTITY_TEMPLATES_CG: + tpl = ENTITY_TEMPLATES_CG[t] + elif custom_subs and t in custom_subs: + tpl = custom_subs[t] + else: + return inner + + data = {"text": inner} + + if t == "text_link": + data["url"] = getattr(entity, "url", "") + elif t == "text_mention": + data["url"] = f"tg://user?id={getattr(entity, 'user', {}).id if getattr(entity, 'user', None) else ''}" + elif t == "custom_emoji": + data["custom_emoji_id"] = getattr(entity, "custom_emoji_id", "") + elif (t == "pre") and getattr(entity, "language", None): + data["text"] = f'
{inner}'
+
+ return tpl.format(**data)
+
+def build_tree(entities: List, mapping: List[int]):
+    """
+    Arrange message entities into a forest of nested nodes.
+
+    Each node is a dict with the keys ``entity``, ``start``, ``end`` (Python
+    string indices) and ``children`` (nodes contained in it).
+
+    Offsets that fall outside ``mapping`` are clamped to its bounds instead of
+    raising :obj:`IndexError`, entities with an empty range are dropped, and an
+    entity that sticks out of its parent is truncated to the parent's end so
+    that the tree is always properly nested.
+
+    :param entities: Objects with ``offset`` and ``length`` attributes (UTF-16 code units).
+    :type entities: :obj:`list`
+
+    :param mapping: Index ``i`` holds the Python string index for UTF-16 code unit ``i``.
+    :type mapping: :obj:`list` of :obj:`int`
+
+    :return: Top-level nodes ordered by start position.
+    :rtype: :obj:`list` of :obj:`dict`
+    """
+    if not mapping:
+        return []
+    last = len(mapping) - 1
+
+    nodes = []
+    for e in entities:
+        start = mapping[min(max(e.offset, 0), last)]
+        end = mapping[min(max(e.offset + e.length, 0), last)]
+        if start >= end:
+            # Nothing to wrap; it would only produce an empty tag.
+            continue
+
+        nodes.append({
+            "entity": e,
+            "start": start,
+            "end": end,
+            "children": []
+        })
+
+    # Parents first: earlier start wins, for equal starts the longer range wins.
+    nodes.sort(key=lambda node: (node["start"], -node["end"]))
+
+    stack = []  # Chain of currently open ancestors, outermost first
+    roots = []
+
+    for n in nodes:
+        # Drop ancestors that end before this node starts
+        while stack and n["start"] >= stack[-1]["end"]:
+            stack.pop()
+
+        if stack:
+            # Partial overlap: keep the child inside its parent
+            n["end"] = min(n["end"], stack[-1]["end"])
+            stack[-1]["children"].append(n)
+        else:
+            roots.append(n)
+
+        stack.append(n)
+
+    return roots
+
+def render(text: str, nodes, custom_subs):
+    """
+    Render ``text`` to HTML using a list of non-overlapping entity nodes.
+
+    Text outside the nodes is HTML-escaped (``&``, ``<``, ``>``). The text of
+    each node is rendered recursively with its children and then wrapped by
+    ``apply_template``.
+
+    :param text: The text slice that node positions refer to.
+    :type text: :obj:`str`
+
+    :param nodes: Node dicts with ``entity``, ``start``, ``end`` and ``children`` keys,
+        ordered by ``start``; positions are relative to ``text``.
+    :type nodes: :obj:`list` of :obj:`dict`
+
+    :param custom_subs: Passed through to ``apply_template``.
+    :type custom_subs: :obj:`dict`
+
+    :return: HTML formatted string.
+    :rtype: :obj:`str`
+    """
+    def escape(fragment: str) -> str:
+        # "&" first, so the entities produced for "<" and ">" are not escaped again.
+        return fragment.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
+
+    result = []
+    pos = 0
+
+    for n in nodes:
+        # Plain text between the previous node and this one
+        result.append(escape(text[pos:n["start"]]))
+
+        # Children positions are relative to the outer text; shift them so that
+        # they are relative to this node's own slice.
+        inner = render(
+            text[n["start"]:n["end"]],
+            shift_nodes(n["children"], n["start"]),
+            custom_subs
+        )
+        result.append(apply_template(n["entity"], inner, custom_subs))
+
+        pos = n["end"]
+
+    # Plain text after the last node (or the whole text when there are no nodes)
+    result.append(escape(text[pos:]))
+    return "".join(result)
+
+def shift_nodes(nodes, shift):
+    """
+    Return a copy of ``nodes`` with every position moved left by ``shift``.
+
+    The same shift is applied recursively to all descendants; the input nodes
+    are not modified.
+
+    :param nodes: Node dicts with ``entity``, ``start``, ``end`` and ``children`` keys.
+    :type nodes: :obj:`list` of :obj:`dict`
+
+    :param shift: Amount subtracted from every ``start`` and ``end``.
+    :type shift: :obj:`int`
+
+    :return: Newly built list of shifted node dicts.
+    :rtype: :obj:`list` of :obj:`dict`
+    """
+    return [
+        {
+            "entity": node["entity"],
+            "start": node["start"] - shift,
+            "end": node["end"] - shift,
+            "children": shift_nodes(node["children"], shift),
+        }
+        for node in nodes
+    ]
+
+def apply_html_entities_cg(
+        text: str,
+        entities: Optional[List],
+        custom_subs: Optional[Dict[str, str]] = None
+) -> str:
+    """
+    Convert a plain-text message and its entities to HTML via an entity tree.
+
+    Entity offsets are mapped from UTF-16 code units to string indices with
+    ``utf16_index_map``, nested with ``build_tree`` and rendered with ``render``.
+    Empty text is returned unchanged; text without entities is only
+    HTML-escaped (``&``, ``<``, ``>``).
+
+    :param text: Plain text message.
+    :type text: :obj:`str`
+
+    :param entities: Message entities with ``offset``, ``length`` and ``type`` attributes.
+    :type entities: :obj:`list`
+
+    :param custom_subs: Optional templates for additional entity types. Defaults to None,
+        matching the other parser variants.
+    :type custom_subs: :obj:`dict`
+
+    :return: HTML formatted string.
+    :rtype: :obj:`str`
+    """
+    if not text:
+        return text
+    if not entities:
+        # "&" first, so the entities produced for "<" and ">" are not escaped again.
+        return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
+
+    mapping = utf16_index_map(text)
+    tree = build_tree(entities, mapping)
+    return render(text, tree, custom_subs)
+#endregion
+
+def apply_html_entities_coder(text: str, entities=None, custom_subs=None) -> str:
+ """
+ Apply HTML formatting to text based on provided entities.
+ Handles nested and overlapping entities correctly.
+ """
+ if not entities:
+ return text.replace("&", "&").replace("<", "<").replace(">", ">")
+
+ _subs = {
+ "bold": "{text}",
+ "italic": "{text}",
+ "pre": "{text}",
+ "code": "{text}",
+ "text_link": "{text}",
+ "strikethrough": "{text}", + "expandable_blockquote": "
{text}", + } + + if custom_subs: + for key, value in custom_subs.items(): + _subs[key] = value + + # Sort entities by offset (starting position), with longer entities first for equal offsets + sorted_entities = sorted(entities, key=lambda e: (e.offset, -e.length)) + + # Convert text to utf-16 encoding for proper handling + utf16_text = text.encode("utf-16-le") + + def escape_html(text_part): + """Escape HTML special characters in a text part""" + if isinstance(text_part, bytes): + text_part = text_part.decode("utf-16-le") + return text_part.replace("&", "&").replace("<", "<").replace(">", ">") + + def format_entity(entity, content): + """Apply entity formatting to the content""" + entity_type = entity.type + + # Handle different entity types + if entity_type == "text_mention" and hasattr(entity, 'user'): + return f"{content}" + # elif entity_type == "mention": # No need to do this, @username works fine + # username = content[1:] # Remove @ symbol + # return f"{content}" + elif entity_type == "text_link" and hasattr(entity, 'url'): + return f"{content}" + elif entity_type == "custom_emoji" and hasattr(entity, 'custom_emoji_id'): + return f"
{content}"
+ elif entity_type in _subs:
+ template = _subs[entity_type]
+ return template.format(text=content)
+
+ # If no matching entity type, return text as is
+ return content
+
+    def process_entities(byte_text, entity_list, start_pos=0, end_pos=None):
+        """
+        Render the window byte_text[start_pos:end_pos] with the given entities.
+
+        byte_text is the UTF-16-LE encoded message, so all positions are byte
+        offsets (entity offset/length multiplied by 2). entity_list must be
+        sorted by offset, longer entities first for equal offsets.
+
+        The first entity is clamped to the window, so an entity that overlaps
+        the end of its parent is truncated instead of having its text emitted
+        twice. An entity with nothing inside the window is skipped and the
+        remaining entities are still processed.
+        """
+        if end_pos is None:
+            end_pos = len(byte_text)
+
+        if not entity_list or start_pos >= end_pos:
+            return escape_html(byte_text[start_pos:end_pos])
+
+        current_entity = entity_list[0]
+        current_start = max(current_entity.offset * 2, start_pos)
+        current_end = min((current_entity.offset + current_entity.length) * 2, end_pos)
+
+        if current_start >= current_end:
+            # Empty or outside the window: ignore this entity only.
+            return process_entities(byte_text, entity_list[1:], start_pos, end_pos)
+
+        result = []
+
+        # Plain text before the entity
+        if current_start > start_pos:
+            result.append(escape_html(byte_text[start_pos:current_start]))
+
+        # Entities starting inside the current one are rendered within it,
+        # all others after it.
+        nested_entities = []
+        remaining_entities = []
+        for entity in entity_list[1:]:
+            if current_start <= entity.offset * 2 < current_end:
+                nested_entities.append(entity)
+            else:
+                remaining_entities.append(entity)
+
+        # With no nested entities this simply escapes the entity's text.
+        inner_content = process_entities(byte_text, nested_entities, current_start, current_end)
+        result.append(format_entity(current_entity, inner_content))
+
+        # Rest of the window; with no remaining entities this escapes the tail.
+        if current_end < end_pos:
+            result.append(process_entities(byte_text, remaining_entities, current_end, end_pos))
+
+        return "".join(result)
+
+ html_result = process_entities(utf16_text, sorted_entities)
+
+ return html_result