diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 668f5ca0..69807f35 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -437,60 +437,26 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False # NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON -import os -def html_to_text_sub_worker(temp_file_path, html_content, render_anchor_tag_content=False, is_rss=False): + +def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str: from inscriptis import get_text from inscriptis.model.config import ParserConfig - try: - if render_anchor_tag_content: - parser_config = ParserConfig( - annotation_rules={"a": ["hyperlink"]}, - display_links=True - ) - else: - parser_config = None - if is_rss: - html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content) - html_content = re.sub(r'</title>', r'</h1>', html_content) + if render_anchor_tag_content: + parser_config = ParserConfig( + annotation_rules={"a": ["hyperlink"]}, + display_links=True + ) + else: + parser_config = None - text_content = get_text(html_content, config=parser_config) + if is_rss: + html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content) + html_content = re.sub(r'</title>', r'</h1>', html_content) - with open(temp_file_path, "w", encoding="utf-8") as f: - f.write(text_content) - - except Exception as e: - # Write error to file so the parent can read it - with open(temp_file_path, "w", encoding="utf-8") as f: - f.write(f"[ERROR] {e}") - -import tempfile -from multiprocessing import Process -def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str: - - - with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp_file: - temp_file_path = tmp_file.name - - p = Process( - target=html_to_text_sub_worker, - args=(temp_file_path, html_content, render_anchor_tag_content, is_rss) - ) - p.start() - p.join(timeout) - - if p.is_alive(): - p.terminate() - p.join() - - try: - with 
open(temp_file_path, "r", encoding="utf-8") as f: - result = f.read() - finally: - os.remove(temp_file_path) - - return result + text_content = get_text(html_content, config=parser_config) + return text_content # Does LD+JSON exist with a @type=='product' and a .price set anywhere? def has_ldjson_product_info(content): diff --git a/changedetectionio/run_basic_tests.sh b/changedetectionio/run_basic_tests.sh index 7e13b766..2f179fda 100755 --- a/changedetectionio/run_basic_tests.sh +++ b/changedetectionio/run_basic_tests.sh @@ -38,6 +38,9 @@ pytest tests/test_backend.py pytest tests/test_rss.py pytest tests/test_unique_lines.py +# Try high concurrency +FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l + # Check file:// will pickup a file when enabled echo "Hello world" > /tmp/test-file.txt ALLOW_FILE_URI=yes pytest tests/test_security.py diff --git a/changedetectionio/tests/test_history_consistency.py b/changedetectionio/tests/test_history_consistency.py index 9e58e201..4b240312 100644 --- a/changedetectionio/tests/test_history_consistency.py +++ b/changedetectionio/tests/test_history_consistency.py @@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs def test_consistent_history(client, live_server, measure_memory_usage): live_server_setup(live_server) - - r = range(1, 30) + workers = int(os.getenv("FETCH_WORKERS", 10)) + r = range(1, 10+workers) for one in r: test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)