Memory fixes for large playwright screenshots (#3092)

This commit is contained in:
Sean Kelly
2025-04-09 08:02:44 -07:00
committed by GitHub
parent 4e6e680d79
commit 9f326783e5
4 changed files with 88 additions and 64 deletions

View File

@@ -4,7 +4,7 @@ import re
from random import randint
from loguru import logger
from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD
from changedetectionio.content_fetchers.helpers import capture_full_page
from changedetectionio.content_fetchers.base import manage_user_agent
from changedetectionio.safe_jinja import render as jinja_render
@@ -298,14 +298,7 @@ class browsersteps_live_ui(steppable_browser_interface):
now = time.time()
self.page.wait_for_timeout(1 * 1000)
full_height = self.page.evaluate("document.documentElement.scrollHeight")
if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD:
logger.warning(f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.")
screenshot = capture_stitched_together_full_page(self.page)
else:
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=40)
screenshot = capture_full_page(self.page)
logger.debug(f"Time to get screenshot from browser {time.time() - now:.2f}s")

View File

@@ -1,79 +1,107 @@
# Pages with a vertical height longer than this will use the 'stitch together' method.
# - Many GPUs have a max texture size of 16384x16384px (or lower on older devices).
# - If a page is taller than ~8000–10000px, it risks exceeding GPU memory limits.
# - This is especially important on headless Chromium, where Playwright may fail to allocate a massive full-page buffer.
# The size at which we will switch to stitching method
SCREENSHOT_SIZE_STITCH_THRESHOLD=8000
from loguru import logger
def capture_stitched_together_full_page(page):
def capture_full_page(page):
import io
import os
import time
from PIL import Image, ImageDraw, ImageFont
MAX_TOTAL_HEIGHT = SCREENSHOT_SIZE_STITCH_THRESHOLD*4 # Maximum total height for the final image (When in stitch mode)
MAX_CHUNK_HEIGHT = 4000 # Height per screenshot chunk
# Maximum total height for the final image (When in stitch mode).
# We limit this to 16000px due to the huge amount of RAM that was being used
# Example: 16000 × 1400 × 3 = 67,200,000 bytes ≈ 64.1 MB (not including buffers in PIL etc)
MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", 16000))
# The page height at which we switch to the stitching method. Pages shorter
# than this (and shorter than MAX_TOTAL_HEIGHT, which can be set by a user)
# use the default screenshot method.
SCREENSHOT_SIZE_STITCH_THRESHOLD = 8000
WARNING_TEXT_HEIGHT = 20 # Height of the warning text overlay
# Save the original viewport size
original_viewport = page.viewport_size
now = time.time()
start = time.time()
stitched_image = None
try:
viewport = page.viewport_size
viewport_width = original_viewport["width"]
viewport_height = original_viewport["height"]
page_height = page.evaluate("document.documentElement.scrollHeight")
# Optimization to avoid unnecessary stitching if we can avoid it
# Use the default screenshot method for smaller pages to take advantage
# of GPU and native playwright screenshot optimizations
if (
page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD
and page_height < MAX_TOTAL_HEIGHT
):
logger.debug("Using default screenshot method")
screenshot = page.screenshot(
type="jpeg",
quality=int(os.getenv("SCREENSHOT_QUALITY", 30)),
full_page=True,
)
logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
return screenshot
logger.debug(
"Using stitching method for large screenshot because page height exceeds threshold"
)
# Limit the total capture height
capture_height = min(page_height, MAX_TOTAL_HEIGHT)
images = []
total_captured_height = 0
# Calculate number of chunks needed using ORIGINAL viewport height
num_chunks = (capture_height + viewport_height - 1) // viewport_height
for offset in range(0, capture_height, MAX_CHUNK_HEIGHT):
# Ensure we do not exceed the total height limit
chunk_height = min(MAX_CHUNK_HEIGHT, MAX_TOTAL_HEIGHT - total_captured_height)
# Create the final image upfront to avoid holding all chunks in memory
stitched_image = Image.new("RGB", (viewport_width, capture_height))
# Adjust viewport size for this chunk
page.set_viewport_size({"width": viewport["width"], "height": chunk_height})
# Scroll to the correct position
page.evaluate(f"window.scrollTo(0, {offset})")
# Capture screenshot chunk
screenshot_bytes = page.screenshot(type='jpeg', quality=int(os.getenv("SCREENSHOT_QUALITY", 30)))
images.append(Image.open(io.BytesIO(screenshot_bytes)))
total_captured_height += chunk_height
# Stop if we reached the maximum total height
if total_captured_height >= MAX_TOTAL_HEIGHT:
break
# Create the final stitched image
stitched_image = Image.new('RGB', (viewport["width"], total_captured_height))
# Track cumulative paste position
y_offset = 0
# Stitch the screenshot chunks together
for img in images:
stitched_image.paste(img, (0, y_offset))
y_offset += img.height
for _ in range(num_chunks):
# Scroll to position (no viewport resizing)
page.evaluate(f"window.scrollTo(0, {y_offset})")
logger.debug(f"Screenshot stitched together in {time.time()-now:.2f}s")
# Capture only the visible area using clip
with io.BytesIO(
page.screenshot(
type="jpeg",
clip={
"x": 0,
"y": 0,
"width": viewport_width,
"height": min(viewport_height, capture_height - y_offset),
},
quality=int(os.getenv("SCREENSHOT_QUALITY", 30)),
)
) as buf:
with Image.open(buf) as img:
img.load()
stitched_image.paste(img, (0, y_offset))
y_offset += img.height
logger.debug(f"Screenshot stitched together in {time.time() - start:.2f}s")
# Overlay warning text if the screenshot was trimmed
if page_height > MAX_TOTAL_HEIGHT:
if capture_height < page_height:
draw = ImageDraw.Draw(stitched_image)
warning_text = f"WARNING: Screenshot was {page_height}px but trimmed to {MAX_TOTAL_HEIGHT}px because it was too long"
# Load font (default system font if Arial is unavailable)
try:
font = ImageFont.truetype("arial.ttf", WARNING_TEXT_HEIGHT) # Arial (Windows/Mac)
font = ImageFont.truetype(
"arial.ttf", WARNING_TEXT_HEIGHT
) # Arial (Windows/Mac)
except IOError:
font = ImageFont.load_default() # Default font if Arial not found
@@ -83,22 +111,28 @@ def capture_stitched_together_full_page(page):
text_height = text_bbox[3] - text_bbox[1] # Calculate text height
# Define background rectangle (top of the image)
draw.rectangle([(0, 0), (viewport["width"], WARNING_TEXT_HEIGHT)], fill="white")
draw.rectangle(
[(0, 0), (viewport_width, WARNING_TEXT_HEIGHT)], fill="white"
)
# Center text horizontally within the warning area
text_x = (viewport["width"] - text_width) // 2
text_x = (viewport_width - text_width) // 2
text_y = (WARNING_TEXT_HEIGHT - text_height) // 2
# Draw the warning text in red
draw.text((text_x, text_y), warning_text, fill="red", font=font)
# Save or return the final image
output = io.BytesIO()
stitched_image.save(output, format="JPEG", quality=int(os.getenv("SCREENSHOT_QUALITY", 30)))
screenshot = output.getvalue()
# Save final image
with io.BytesIO() as output:
stitched_image.save(
output, format="JPEG", quality=int(os.getenv("SCREENSHOT_QUALITY", 30))
)
screenshot = output.getvalue()
finally:
# Restore the original viewport size
page.set_viewport_size(original_viewport)
if stitched_image is not None:
stitched_image.close()
return screenshot

View File

@@ -4,7 +4,7 @@ from urllib.parse import urlparse
from loguru import logger
from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD
from changedetectionio.content_fetchers.helpers import capture_full_page
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
@@ -204,14 +204,7 @@ class fetcher(Fetcher):
# acceptable screenshot quality here
try:
# The actual screenshot - this is always base64 and needs decoding! horrible! huge CPU usage
full_height = self.page.evaluate("document.documentElement.scrollHeight")
if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD:
logger.warning(
f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.")
self.screenshot = capture_stitched_together_full_page(self.page)
else:
self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 30)))
self.screenshot = capture_full_page(self.page)
except Exception as e:
# It's likely the screenshot was too long/big and something crashed

View File

@@ -63,6 +63,10 @@ services:
#
# A valid timezone name to run as (for scheduling watch checking) see https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
# - TZ=America/Los_Angeles
#
# Maximum height of screenshots (default is 16000 px); screenshots will be clipped to this height if exceeded.
# RAM usage will be higher if you increase this.
# - SCREENSHOT_MAX_HEIGHT=16000
# Comment out ports: when running behind a reverse proxy, enable networks: etc.
ports: