Merge branch 'revert-multiprocess-htmlt-to-text' into realtime-ui

2025-05-08 18:17:17 +02:00
parent c2b02d61ba e38f264750
commit 4eb9b76f6d
2 changed files with 47 additions and 42 deletions
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -436,55 +436,61 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
    return re.sub(pattern, repl, html_content)
-def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
+# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
 import os
 def html_to_text_sub_worker(temp_file_path, html_content, render_anchor_tag_content=False, is_rss=False):
    from inscriptis import get_text
    from inscriptis.model.config import ParserConfig
    try:
        if render_anchor_tag_content:
            parser_config = ParserConfig(
                annotation_rules={"a": ["hyperlink"]},
                display_links=True
            )
        else:
            parser_config = None
-    """Converts html string to a string with just the text. If ignoring
+        if is_rss:
-    rendering anchor tag content is enable, anchor tag content are also
+            html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
-    included in the text
+            html_content = re.sub(r'</title>', r'</h1>', html_content)
-    :param html_content: string with html content
+        text_content = get_text(html_content, config=parser_config)
    :param render_anchor_tag_content: boolean flag indicating whether to extract
    hyperlinks (the anchor tag content) together with text. This refers to the
    'href' inside 'a' tags.
    Anchor tag content is rendered in the following manner:
    '[ text ](anchor tag content)'
    :return: extracted text from the HTML
    """
    #  if anchor tag content flag is set to True define a config for
    #  extracting this content
    if render_anchor_tag_content:
        parser_config = ParserConfig(
            annotation_rules={"a": ["hyperlink"]},
            display_links=True
        )
    # otherwise set config to None/default
    else:
        parser_config = None
-    # RSS Mode - Inscriptis will treat `title` as something else.
+        with open(temp_file_path, "w", encoding="utf-8") as f:
-    # Make it as a regular block display element (//item/title)
+            f.write(text_content)
    # This is a bit of a hack - the real way is to use XSLT to convert it to HTML #1874
    if is_rss:
        html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
        html_content = re.sub(r'</title>', r'</h1>', html_content)
-    text_content = get_text(html_content, config=parser_config)
+    except Exception as e:
-    conn.send(text_content)
+        # Write error to file so the parent can read it
-    conn.close()
+        with open(temp_file_path, "w", encoding="utf-8") as f:
            f.write(f"[ERROR] {e}")
-# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
+import tempfile
-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
+from multiprocessing import Process
-    from multiprocessing import Process, Pipe
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
-    parent_conn, child_conn = Pipe()
+
-    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
+    with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp_file:
        temp_file_path = tmp_file.name
    p = Process(
        target=html_to_text_sub_worker,
        args=(temp_file_path, html_content, render_anchor_tag_content, is_rss)
    )
    p.start()
-    text = parent_conn.recv()
+    p.join(timeout)
-    p.join()
+
-    return text
+    if p.is_alive():
        p.terminate()
        p.join()
    try:
        with open(temp_file_path, "r", encoding="utf-8") as f:
            result = f.read()
    finally:
        os.remove(temp_file_path)
    return result
 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):
--- a/changedetectionio/tests/test_history_consistency.py
+++ b/changedetectionio/tests/test_history_consistency.py
@@ -59,8 +59,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
            assert len(tmp_history) == 1, "History.txt should contain 1 line"
        # Should be two files: the history.txt and the snapshot.txt
-        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path,
+        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
-                                                     w))
+
        # Find the snapshot one
        for fname in files_in_watch_dir:
            if fname != 'history.txt' and 'html' not in fname:
@@ -76,7 +76,6 @@ def test_consistent_history(client, live_server, measure_memory_usage):
        assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
    json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
    with open(json_db_file, 'r') as f:
        assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"