From 1ec86bd38dae69efb0814cf13125bde00b7d02c9 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 8 May 2025 18:09:47 +0200 Subject: [PATCH 1/2] Revert multiprocess memory management, was unreliable under high concurrency --- changedetectionio/html_tools.py | 19 +++++-------------- .../tests/test_history_consistency.py | 5 ++--- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index ddd5cbef..19c83b94 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -436,7 +436,10 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False return re.sub(pattern, repl, html_content) -def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False): + + +# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON +def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False): from inscriptis import get_text from inscriptis.model.config import ParserConfig @@ -472,19 +475,7 @@ def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=F html_content = re.sub(r'', r'', html_content) text_content = get_text(html_content, config=parser_config) - conn.send(text_content) - conn.close() - -# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON -def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False): - from multiprocessing import Process, Pipe - - parent_conn, child_conn = Pipe() - p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss)) - p.start() - text = parent_conn.recv() - p.join() - return text + return text_content # Does LD+JSON exist with a @type=='product' and a .price set anywhere? def has_ldjson_product_info(content): diff --git a/changedetectionio/tests/test_history_consistency.py b/changedetectionio/tests/test_history_consistency.py index 13943f6a..8e9335a8 100644 --- a/changedetectionio/tests/test_history_consistency.py +++ b/changedetectionio/tests/test_history_consistency.py @@ -58,8 +58,8 @@ def test_consistent_history(client, live_server, measure_memory_usage): assert len(tmp_history) == 1, "History.txt should contain 1 line" # Should be two files,. the history.txt , and the snapshot.txt - files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, - w)) + files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w)) + # Find the snapshot one for fname in files_in_watch_dir: if fname != 'history.txt' and 'html' not in fname: @@ -75,7 +75,6 @@ def test_consistent_history(client, live_server, measure_memory_usage): assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot" - json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json') with open(json_db_file, 'r') as f: assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved" From e38f264750f4f4588db16a45d299863f5212ad87 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 8 May 2025 18:15:16 +0200 Subject: [PATCH 2/2] Avoid pickling issues --- changedetectionio/html_tools.py | 79 ++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 19c83b94..668f5ca0 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -436,46 +436,61 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False return re.sub(pattern, repl, html_content) - - # NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON -def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False): +import os +def html_to_text_sub_worker(temp_file_path, html_content, render_anchor_tag_content=False, is_rss=False): from inscriptis import get_text from inscriptis.model.config import ParserConfig + try: + if render_anchor_tag_content: + parser_config = ParserConfig( + annotation_rules={"a": ["hyperlink"]}, + display_links=True + ) + else: + parser_config = None - """Converts html string to a string with just the text. If ignoring - rendering anchor tag content is enable, anchor tag content are also - included in the text + if is_rss: + html_content = re.sub(r'])', r'', r'', html_content) - :param html_content: string with html content - :param render_anchor_tag_content: boolean flag indicating whether to extract - hyperlinks (the anchor tag content) together with text. This refers to the - 'href' inside 'a' tags. - Anchor tag content is rendered in the following manner: - '[ text ](anchor tag content)' - :return: extracted text from the HTML - """ - # if anchor tag content flag is set to True define a config for - # extracting this content - if render_anchor_tag_content: - parser_config = ParserConfig( - annotation_rules={"a": ["hyperlink"]}, - display_links=True - ) - # otherwise set config to None/default - else: - parser_config = None + text_content = get_text(html_content, config=parser_config) - # RSS Mode - Inscriptis will treat `title` as something else. - # Make it as a regular block display element (//item/title) - # This is a bit of a hack - the real way it to use XSLT to convert it to HTML #1874 - if is_rss: - html_content = re.sub(r'])', r'', r'', html_content) + with open(temp_file_path, "w", encoding="utf-8") as f: + f.write(text_content) - text_content = get_text(html_content, config=parser_config) - return text_content + except Exception as e: + # Write error to file so the parent can read it + with open(temp_file_path, "w", encoding="utf-8") as f: + f.write(f"[ERROR] {e}") + +import tempfile +from multiprocessing import Process +def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str: + + + with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp_file: + temp_file_path = tmp_file.name + + p = Process( + target=html_to_text_sub_worker, + args=(temp_file_path, html_content, render_anchor_tag_content, is_rss) + ) + p.start() + p.join(timeout) + + if p.is_alive(): + p.terminate() + p.join() + + try: + with open(temp_file_path, "r", encoding="utf-8") as f: + result = f.read() + finally: + os.remove(temp_file_path) + + return result # Does LD+JSON exist with a @type=='product' and a .price set anywhere? def has_ldjson_product_info(content):