diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py index ef920633..b88ae4b2 100644 --- a/changedetectionio/content_fetchers/playwright.py +++ b/changedetectionio/content_fetchers/playwright.py @@ -59,7 +59,10 @@ def capture_full_page(page): p.join() logger.debug( f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s") - + # Explicit cleanup + del screenshot_chunks + del p + del parent_conn, child_conn screenshot_chunks = None return screenshot @@ -286,12 +289,28 @@ class fetcher(Fetcher): pass # Clean up resources properly - context.close() - context = None + try: + self.page.request_gc() + except: + pass - self.page.close() + try: + self.page.close() + except: + pass self.page = None - browser.close() - borwser = None + try: + context.close() + except: + pass + context = None + + try: + browser.close() + except: + pass + browser = None + + diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 3250db95..ddd5cbef 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -435,7 +435,9 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False return re.sub(pattern, repl, html_content) -def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str: + +def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False): + from inscriptis import get_text from inscriptis.model.config import ParserConfig @@ -470,9 +472,19 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals html_content = re.sub(r'', r'', html_content) text_content = get_text(html_content, config=parser_config) + conn.send(text_content) + conn.close() - return text_content +# NOTE!! 
ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
+    """Convert HTML to plain text by running the conversion in a child process.
+
+    Starts ``html_to_text_sub_worker`` in a separate process, hands it one end
+    of a pipe plus the arguments unchanged, and returns whatever the worker
+    sends back over that pipe. The comment preceding this function says the
+    parsing libraries leak memory in native code; running them in a
+    short-lived child presumably lets that memory be reclaimed when the child
+    exits (NOTE(review): confirm that this is the intent).
+
+    :param html_content: HTML passed through to the worker.
+    :param render_anchor_tag_content: forwarded unchanged to the worker.
+    :param is_rss: forwarded unchanged to the worker.
+    :return: the text the worker sent over the pipe.
+    :raises RuntimeError: if the worker exits without sending a result.
+    """
+    from multiprocessing import Process, Pipe
+
+    parent_conn, child_conn = Pipe()
+    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
+    p.start()
+    # Drop the parent's copy of the worker's end of the pipe. While the parent
+    # still holds it open, recv() can never see EOF, so a worker that crashes
+    # before sending would make this call block forever.
+    child_conn.close()
+    try:
+        text = parent_conn.recv()
+    except EOFError as e:
+        # Join first so that p.exitcode is populated for the error message.
+        p.join()
+        raise RuntimeError(
+            f"html_to_text worker exited without returning a result (exit code {p.exitcode})"
+        ) from e
+    finally:
+        # Always release the pipe and reap the child, on success and on failure.
+        parent_conn.close()
+        p.join()
+    return text

 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):