Memory fixes for large playwright screenshots (#3092)

This commit is contained in:
Sean Kelly
2025-04-09 08:02:44 -07:00
committed by GitHub
parent 4e6e680d79
commit 9f326783e5
4 changed files with 88 additions and 64 deletions

View File

@@ -4,7 +4,7 @@ import re
from random import randint from random import randint
from loguru import logger from loguru import logger
from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD from changedetectionio.content_fetchers.helpers import capture_full_page
from changedetectionio.content_fetchers.base import manage_user_agent from changedetectionio.content_fetchers.base import manage_user_agent
from changedetectionio.safe_jinja import render as jinja_render from changedetectionio.safe_jinja import render as jinja_render
@@ -298,14 +298,7 @@ class browsersteps_live_ui(steppable_browser_interface):
now = time.time() now = time.time()
self.page.wait_for_timeout(1 * 1000) self.page.wait_for_timeout(1 * 1000)
screenshot = capture_full_page(self.page)
full_height = self.page.evaluate("document.documentElement.scrollHeight")
if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD:
logger.warning(f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.")
screenshot = capture_stitched_together_full_page(self.page)
else:
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=40)
logger.debug(f"Time to get screenshot from browser {time.time() - now:.2f}s") logger.debug(f"Time to get screenshot from browser {time.time() - now:.2f}s")

View File

@@ -1,79 +1,107 @@
# Pages with a vertical height longer than this will use the 'stitch together' method. # Pages with a vertical height longer than this will use the 'stitch together' method.
# - Many GPUs have a max texture size of 16384x16384px (or lower on older devices). # - Many GPUs have a max texture size of 16384x16384px (or lower on older devices).
# - If a page is taller than ~8000–10000px, it risks exceeding GPU memory limits. # - If a page is taller than ~8000–10000px, it risks exceeding GPU memory limits.
# - This is especially important on headless Chromium, where Playwright may fail to allocate a massive full-page buffer. # - This is especially important on headless Chromium, where Playwright may fail to allocate a massive full-page buffer.
# The size at which we will switch to stitching method
SCREENSHOT_SIZE_STITCH_THRESHOLD=8000
from loguru import logger from loguru import logger
def capture_stitched_together_full_page(page): def capture_full_page(page):
import io import io
import os import os
import time import time
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
MAX_TOTAL_HEIGHT = SCREENSHOT_SIZE_STITCH_THRESHOLD*4 # Maximum total height for the final image (When in stitch mode) # Maximum total height for the final image (When in stitch mode).
MAX_CHUNK_HEIGHT = 4000 # Height per screenshot chunk # We limit this to 16000px due to the huge amount of RAM that was being used
# Example: 16000 × 1400 × 3 = 67,200,000 bytes ≈ 64.1 MB (not including buffers in PIL etc)
MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", 16000))
# The size at which we will switch to stitching method, when below this (and
# MAX_TOTAL_HEIGHT which can be set by a user) we will use the default
# screenshot method.
SCREENSHOT_SIZE_STITCH_THRESHOLD = 8000
WARNING_TEXT_HEIGHT = 20 # Height of the warning text overlay WARNING_TEXT_HEIGHT = 20 # Height of the warning text overlay
# Save the original viewport size # Save the original viewport size
original_viewport = page.viewport_size original_viewport = page.viewport_size
now = time.time() start = time.time()
stitched_image = None
try: try:
viewport = page.viewport_size viewport_width = original_viewport["width"]
viewport_height = original_viewport["height"]
page_height = page.evaluate("document.documentElement.scrollHeight") page_height = page.evaluate("document.documentElement.scrollHeight")
# Optimization to avoid unnecessary stitching if we can avoid it
# Use the default screenshot method for smaller pages to take advantage
# of GPU and native playwright screenshot optimizations
if (
page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD
and page_height < MAX_TOTAL_HEIGHT
):
logger.debug("Using default screenshot method")
screenshot = page.screenshot(
type="jpeg",
quality=int(os.getenv("SCREENSHOT_QUALITY", 30)),
full_page=True,
)
logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
return screenshot
logger.debug(
"Using stitching method for large screenshot because page height exceeds threshold"
)
# Limit the total capture height # Limit the total capture height
capture_height = min(page_height, MAX_TOTAL_HEIGHT) capture_height = min(page_height, MAX_TOTAL_HEIGHT)
images = [] # Calculate number of chunks needed using ORIGINAL viewport height
total_captured_height = 0 num_chunks = (capture_height + viewport_height - 1) // viewport_height
for offset in range(0, capture_height, MAX_CHUNK_HEIGHT): # Create the final image upfront to avoid holding all chunks in memory
# Ensure we do not exceed the total height limit stitched_image = Image.new("RGB", (viewport_width, capture_height))
chunk_height = min(MAX_CHUNK_HEIGHT, MAX_TOTAL_HEIGHT - total_captured_height)
# Adjust viewport size for this chunk # Track cumulative paste position
page.set_viewport_size({"width": viewport["width"], "height": chunk_height})
# Scroll to the correct position
page.evaluate(f"window.scrollTo(0, {offset})")
# Capture screenshot chunk
screenshot_bytes = page.screenshot(type='jpeg', quality=int(os.getenv("SCREENSHOT_QUALITY", 30)))
images.append(Image.open(io.BytesIO(screenshot_bytes)))
total_captured_height += chunk_height
# Stop if we reached the maximum total height
if total_captured_height >= MAX_TOTAL_HEIGHT:
break
# Create the final stitched image
stitched_image = Image.new('RGB', (viewport["width"], total_captured_height))
y_offset = 0 y_offset = 0
# Stitch the screenshot chunks together for _ in range(num_chunks):
for img in images: # Scroll to position (no viewport resizing)
page.evaluate(f"window.scrollTo(0, {y_offset})")
# Capture only the visible area using clip
with io.BytesIO(
page.screenshot(
type="jpeg",
clip={
"x": 0,
"y": 0,
"width": viewport_width,
"height": min(viewport_height, capture_height - y_offset),
},
quality=int(os.getenv("SCREENSHOT_QUALITY", 30)),
)
) as buf:
with Image.open(buf) as img:
img.load()
stitched_image.paste(img, (0, y_offset)) stitched_image.paste(img, (0, y_offset))
y_offset += img.height y_offset += img.height
logger.debug(f"Screenshot stitched together in {time.time()-now:.2f}s") logger.debug(f"Screenshot stitched together in {time.time() - start:.2f}s")
# Overlay warning text if the screenshot was trimmed # Overlay warning text if the screenshot was trimmed
if page_height > MAX_TOTAL_HEIGHT: if capture_height < page_height:
draw = ImageDraw.Draw(stitched_image) draw = ImageDraw.Draw(stitched_image)
warning_text = f"WARNING: Screenshot was {page_height}px but trimmed to {MAX_TOTAL_HEIGHT}px because it was too long" warning_text = f"WARNING: Screenshot was {page_height}px but trimmed to {MAX_TOTAL_HEIGHT}px because it was too long"
# Load font (default system font if Arial is unavailable) # Load font (default system font if Arial is unavailable)
try: try:
font = ImageFont.truetype("arial.ttf", WARNING_TEXT_HEIGHT) # Arial (Windows/Mac) font = ImageFont.truetype(
"arial.ttf", WARNING_TEXT_HEIGHT
) # Arial (Windows/Mac)
except IOError: except IOError:
font = ImageFont.load_default() # Default font if Arial not found font = ImageFont.load_default() # Default font if Arial not found
@@ -83,22 +111,28 @@ def capture_stitched_together_full_page(page):
text_height = text_bbox[3] - text_bbox[1] # Calculate text height text_height = text_bbox[3] - text_bbox[1] # Calculate text height
# Define background rectangle (top of the image) # Define background rectangle (top of the image)
draw.rectangle([(0, 0), (viewport["width"], WARNING_TEXT_HEIGHT)], fill="white") draw.rectangle(
[(0, 0), (viewport_width, WARNING_TEXT_HEIGHT)], fill="white"
)
# Center text horizontally within the warning area # Center text horizontally within the warning area
text_x = (viewport["width"] - text_width) // 2 text_x = (viewport_width - text_width) // 2
text_y = (WARNING_TEXT_HEIGHT - text_height) // 2 text_y = (WARNING_TEXT_HEIGHT - text_height) // 2
# Draw the warning text in red # Draw the warning text in red
draw.text((text_x, text_y), warning_text, fill="red", font=font) draw.text((text_x, text_y), warning_text, fill="red", font=font)
# Save or return the final image # Save final image
output = io.BytesIO() with io.BytesIO() as output:
stitched_image.save(output, format="JPEG", quality=int(os.getenv("SCREENSHOT_QUALITY", 30))) stitched_image.save(
output, format="JPEG", quality=int(os.getenv("SCREENSHOT_QUALITY", 30))
)
screenshot = output.getvalue() screenshot = output.getvalue()
finally: finally:
# Restore the original viewport size # Restore the original viewport size
page.set_viewport_size(original_viewport) page.set_viewport_size(original_viewport)
if stitched_image is not None:
stitched_image.close()
return screenshot return screenshot

View File

@@ -4,7 +4,7 @@ from urllib.parse import urlparse
from loguru import logger from loguru import logger
from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD from changedetectionio.content_fetchers.helpers import capture_full_page
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
@@ -204,14 +204,7 @@ class fetcher(Fetcher):
# acceptable screenshot quality here # acceptable screenshot quality here
try: try:
# The actual screenshot - this is always base64 and needs decoding! horrible! huge CPU usage # The actual screenshot - this is always base64 and needs decoding! horrible! huge CPU usage
full_height = self.page.evaluate("document.documentElement.scrollHeight") self.screenshot = capture_full_page(self.page)
if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD:
logger.warning(
f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.")
self.screenshot = capture_stitched_together_full_page(self.page)
else:
self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 30)))
except Exception as e: except Exception as e:
# It's likely the screenshot was too long/big and something crashed # It's likely the screenshot was too long/big and something crashed

View File

@@ -63,6 +63,10 @@ services:
# #
# A valid timezone name to run as (for scheduling watch checking) see https://en.wikipedia.org/wiki/List_of_tz_database_time_zones # A valid timezone name to run as (for scheduling watch checking) see https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
# - TZ=America/Los_Angeles # - TZ=America/Los_Angeles
#
# Maximum height of screenshots in pixels (default is 16000 px); taller screenshots will be clipped to this height.
# RAM usage will be higher if you increase this.
# - SCREENSHOT_MAX_HEIGHT=16000
# Comment out ports: when using behind a reverse proxy, enable networks: etc. # Comment out ports: when using behind a reverse proxy, enable networks: etc.
ports: ports: