From 9f326783e5c37fba0a025c22017b3fb61410e317 Mon Sep 17 00:00:00 2001 From: Sean Kelly Date: Wed, 9 Apr 2025 08:02:44 -0700 Subject: [PATCH] Memory fixes for large playwright screenshots (#3092) --- .../blueprint/browser_steps/browser_steps.py | 11 +- changedetectionio/content_fetchers/helpers.py | 126 +++++++++++------- .../content_fetchers/playwright.py | 11 +- docker-compose.yml | 4 + 4 files changed, 88 insertions(+), 64 deletions(-) diff --git a/changedetectionio/blueprint/browser_steps/browser_steps.py b/changedetectionio/blueprint/browser_steps/browser_steps.py index 00a30a36..353f98e4 100644 --- a/changedetectionio/blueprint/browser_steps/browser_steps.py +++ b/changedetectionio/blueprint/browser_steps/browser_steps.py @@ -4,7 +4,7 @@ import re from random import randint from loguru import logger -from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD +from changedetectionio.content_fetchers.helpers import capture_full_page from changedetectionio.content_fetchers.base import manage_user_agent from changedetectionio.safe_jinja import render as jinja_render @@ -298,14 +298,7 @@ class browsersteps_live_ui(steppable_browser_interface): now = time.time() self.page.wait_for_timeout(1 * 1000) - - full_height = self.page.evaluate("document.documentElement.scrollHeight") - - if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD: - logger.warning(f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.") - screenshot = capture_stitched_together_full_page(self.page) - else: - screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=40) + screenshot = capture_full_page(self.page) logger.debug(f"Time to get screenshot from browser {time.time() - now:.2f}s") diff --git a/changedetectionio/content_fetchers/helpers.py b/changedetectionio/content_fetchers/helpers.py index 79826dcc..def26ca3 100644 --- a/changedetectionio/content_fetchers/helpers.py +++ b/changedetectionio/content_fetchers/helpers.py @@ -1,79 +1,107 @@ - # Pages with a vertical height longer than this will use the 'stitch together' method. # - Many GPUs have a max texture size of 16384x16384px (or lower on older devices). # - If a page is taller than ~8000–10000px, it risks exceeding GPU memory limits. # - This is especially important on headless Chromium, where Playwright may fail to allocate a massive full-page buffer. - -# The size at which we will switch to stitching method -SCREENSHOT_SIZE_STITCH_THRESHOLD=8000 - from loguru import logger -def capture_stitched_together_full_page(page): +def capture_full_page(page): import io import os import time from PIL import Image, ImageDraw, ImageFont - MAX_TOTAL_HEIGHT = SCREENSHOT_SIZE_STITCH_THRESHOLD*4 # Maximum total height for the final image (When in stitch mode) - MAX_CHUNK_HEIGHT = 4000 # Height per screenshot chunk + # Maximum total height for the final image (When in stitch mode). + # We limit this to 16000px due to the huge amount of RAM that was being used + # Example: 16000 × 1400 × 3 = 67,200,000 bytes ≈ 64.1 MB (not including buffers in PIL etc) + MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", 16000)) + + # The size at which we will switch to stitching method, when below this (and + # MAX_TOTAL_HEIGHT which can be set by a user) we will use the default + # screenshot method. + SCREENSHOT_SIZE_STITCH_THRESHOLD = 8000 + WARNING_TEXT_HEIGHT = 20 # Height of the warning text overlay # Save the original viewport size original_viewport = page.viewport_size - now = time.time() + start = time.time() + + stitched_image = None try: - viewport = page.viewport_size + viewport_width = original_viewport["width"] + viewport_height = original_viewport["height"] + page_height = page.evaluate("document.documentElement.scrollHeight") + # Optimization to avoid unnecessary stitching if we can avoid it + # Use the default screenshot method for smaller pages to take advantage + # of GPU and native playwright screenshot optimizations + if ( + page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD + and page_height < MAX_TOTAL_HEIGHT + ): + logger.debug("Using default screenshot method") + screenshot = page.screenshot( + type="jpeg", + quality=int(os.getenv("SCREENSHOT_QUALITY", 30)), + full_page=True, + ) + logger.debug(f"Screenshot captured in {time.time() - start:.2f}s") + return screenshot + + logger.debug( + "Using stitching method for large screenshot because page height exceeds threshold" + ) + # Limit the total capture height capture_height = min(page_height, MAX_TOTAL_HEIGHT) - images = [] - total_captured_height = 0 + # Calculate number of chunks needed using ORIGINAL viewport height + num_chunks = (capture_height + viewport_height - 1) // viewport_height - for offset in range(0, capture_height, MAX_CHUNK_HEIGHT): - # Ensure we do not exceed the total height limit - chunk_height = min(MAX_CHUNK_HEIGHT, MAX_TOTAL_HEIGHT - total_captured_height) + # Create the final image upfront to avoid holding all chunks in memory + stitched_image = Image.new("RGB", (viewport_width, capture_height)) - # Adjust viewport size for this chunk - page.set_viewport_size({"width": viewport["width"], "height": chunk_height}) - - # Scroll to the correct position - page.evaluate(f"window.scrollTo(0, {offset})") - - # Capture screenshot chunk - screenshot_bytes = page.screenshot(type='jpeg', quality=int(os.getenv("SCREENSHOT_QUALITY", 30))) - images.append(Image.open(io.BytesIO(screenshot_bytes))) - - total_captured_height += chunk_height - - # Stop if we reached the maximum total height - if total_captured_height >= MAX_TOTAL_HEIGHT: - break - - # Create the final stitched image - stitched_image = Image.new('RGB', (viewport["width"], total_captured_height)) + # Track cumulative paste position y_offset = 0 - # Stitch the screenshot chunks together - for img in images: - stitched_image.paste(img, (0, y_offset)) - y_offset += img.height + for _ in range(num_chunks): + # Scroll to position (no viewport resizing) + page.evaluate(f"window.scrollTo(0, {y_offset})") - logger.debug(f"Screenshot stitched together in {time.time()-now:.2f}s") + # Capture only the visible area using clip + with io.BytesIO( + page.screenshot( + type="jpeg", + clip={ + "x": 0, + "y": 0, + "width": viewport_width, + "height": min(viewport_height, capture_height - y_offset), + }, + quality=int(os.getenv("SCREENSHOT_QUALITY", 30)), + ) + ) as buf: + with Image.open(buf) as img: + img.load() + stitched_image.paste(img, (0, y_offset)) + y_offset += img.height + + logger.debug(f"Screenshot stitched together in {time.time() - start:.2f}s") # Overlay warning text if the screenshot was trimmed - if page_height > MAX_TOTAL_HEIGHT: + if capture_height < page_height: draw = ImageDraw.Draw(stitched_image) warning_text = f"WARNING: Screenshot was {page_height}px but trimmed to {MAX_TOTAL_HEIGHT}px because it was too long" # Load font (default system font if Arial is unavailable) try: - font = ImageFont.truetype("arial.ttf", WARNING_TEXT_HEIGHT) # Arial (Windows/Mac) + font = ImageFont.truetype( + "arial.ttf", WARNING_TEXT_HEIGHT + ) # Arial (Windows/Mac) except IOError: font = ImageFont.load_default() # Default font if Arial not found @@ -83,22 +111,28 @@ def capture_stitched_together_full_page(page): text_height = text_bbox[3] - text_bbox[1] # Calculate text height # Define background rectangle (top of the image) - draw.rectangle([(0, 0), (viewport["width"], WARNING_TEXT_HEIGHT)], fill="white") + draw.rectangle( + [(0, 0), (viewport_width, WARNING_TEXT_HEIGHT)], fill="white" + ) # Center text horizontally within the warning area - text_x = (viewport["width"] - text_width) // 2 + text_x = (viewport_width - text_width) // 2 text_y = (WARNING_TEXT_HEIGHT - text_height) // 2 # Draw the warning text in red draw.text((text_x, text_y), warning_text, fill="red", font=font) - # Save or return the final image - output = io.BytesIO() - stitched_image.save(output, format="JPEG", quality=int(os.getenv("SCREENSHOT_QUALITY", 30))) - screenshot = output.getvalue() + # Save final image + with io.BytesIO() as output: + stitched_image.save( + output, format="JPEG", quality=int(os.getenv("SCREENSHOT_QUALITY", 30)) + ) + screenshot = output.getvalue() finally: # Restore the original viewport size page.set_viewport_size(original_viewport) + if stitched_image is not None: + stitched_image.close() return screenshot diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py index 861cea60..e3bbb8b3 100644 --- a/changedetectionio/content_fetchers/playwright.py +++ b/changedetectionio/content_fetchers/playwright.py @@ -4,7 +4,7 @@ from urllib.parse import urlparse from loguru import logger -from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD +from changedetectionio.content_fetchers.helpers import capture_full_page from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable @@ -204,14 +204,7 @@ class fetcher(Fetcher): # acceptable screenshot quality here try: # The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage - full_height = self.page.evaluate("document.documentElement.scrollHeight") - - if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD: - logger.warning( - f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.") - self.screenshot = capture_stitched_together_full_page(self.page) - else: - self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 30))) + self.screenshot = capture_full_page(self.page) except Exception as e: # It's likely the screenshot was too long/big and something crashed diff --git a/docker-compose.yml b/docker-compose.yml index bf21f65a..370709e7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -63,6 +63,10 @@ services: # # A valid timezone name to run as (for scheduling watch checking) see https://en.wikipedia.org/wiki/List_of_tz_database_time_zones # - TZ=America/Los_Angeles + # + # Maximum height of screenshots, default is 16000 px, screenshots will be clipped to this if exceeded. + # RAM usage will be higher if you increase this. + # - SCREENSHOT_MAX_HEIGHT=16000 # Comment out ports: when using behind a reverse proxy , enable networks: etc. ports: