Adding new Visual Selector for choosing the area of the webpage to monitor - playwright/browserless only (#566)
This commit is contained in:
@@ -27,6 +27,117 @@ class Fetcher():
|
||||
status_code = None
|
||||
content = None
|
||||
headers = None
|
||||
|
||||
fetcher_description = "No description"
|
||||
# JavaScript injected into the fetched page to build the element map used by
# the Visual Selector.
#
# The text is a function *body*: the caller wraps it as
# "async () => {" + xpath_element_js + "}" before evaluating it, which is why
# it ends in a bare `return`. It expects a page-level variable `css_filter`
# (a CSS selector, an XPath starting with '/', or an empty string) to have been
# defined beforehand.
#
# The script returns an object with:
#   size_pos       - list of {xpath, width, height, left, top, childCount}
#                    entries, one per candidate element, plus one for the
#                    element matched by `css_filter` when it resolves to
#                    something with a non-empty bounding box.
#   browser_width  - window.innerWidth
#   browser_height - document.body.scrollHeight
xpath_element_js = """
    // Include the getXpath script directly, easier than fetching
    !function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):(e=e||self).getXPath=n()}(this,function(){return function(e){var n=e;if(n&&n.id)return'//*[@id="'+n.id+'"]';for(var o=[];n&&Node.ELEMENT_NODE===n.nodeType;){for(var i=0,r=!1,d=n.previousSibling;d;)d.nodeType!==Node.DOCUMENT_TYPE_NODE&&d.nodeName===n.nodeName&&i++,d=d.previousSibling;for(d=n.nextSibling;d;){if(d.nodeName===n.nodeName){r=!0;break}d=d.nextSibling}o.push((n.prefix?n.prefix+":":"")+n.localName+(i||r?"["+(i+1)+"]":"")),n=n.parentNode}return o.length?"/"+o.reverse().join("/"):""}});

    // Walk up from `el` (at most 5 levels) looking for an ancestor-or-self with
    // an ID, and return a child-combinator CSS selector anchored on that ID.
    // Returns null when no ID is found in range or the selector is not unique.
    const findUpTag = (el) => {
        let r = el;
        // Declared locally so the helper does not leak state onto window
        const chained_css = [];
        let depth = 0;

        // Strategy 1: Keep going up until we hit an ID tag, imagine it's like #list-widget div h4
        while (r.parentNode) {
            if (depth == 5) {
                break;
            }
            if ('' !== r.id) {
                chained_css.unshift("#" + r.id);
                const final_selector = chained_css.join('>');
                // Be sure there's only one, some sites have multiples of the same ID tag :-(
                if (window.document.querySelectorAll(final_selector).length == 1) {
                    return final_selector;
                }
                return null;
            } else {
                chained_css.unshift(r.tagName.toLowerCase());
            }
            r = r.parentNode;
            depth += 1;
        }
        return null;
    };

    // @todo - if it's SVG or IMG, go into image diff mode
    var elements = window.document.querySelectorAll("div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary");
    var size_pos = [];
    // after page fetch, inject this JS
    // build a map of all elements and their positions (maybe that only include text?)
    var bbox;
    for (var i = 0; i < elements.length; i++) {
        bbox = elements[i].getBoundingClientRect();

        // forget really small ones
        if (bbox['width'] < 20 && bbox['height'] < 20) {
            continue;
        }

        // Hidden elements are never reported, so skip them before doing any selector work
        if (window.getComputedStyle(elements[i]).visibility === "hidden") {
            continue;
        }

        // @todo the getXpath kind of sucks, it doesn't know when there is for example just one ID sometimes
        // it should not traverse when we know we can anchor off just an ID one level up etc..
        // maybe, get current class or id, keep traversing up looking for only class or id until there is just one match

        // 1st primitive - try the ID-anchored selector first
        var xpath_result = false;
        try {
            var d = findUpTag(elements[i]);
            if (d) {
                xpath_result = d;
            }
        } catch (e) {
            // Best effort only (an odd ID can make the generated selector invalid);
            // the getXPath fallback below takes over.
        }

        // You could swap it and default to getXpath and then try the smarter one
        // default back to the less intelligent one
        if (!xpath_result) {
            xpath_result = getXPath(elements[i]);
        }

        size_pos.push({
            xpath: xpath_result,
            width: Math.round(bbox['width']),
            height: Math.round(bbox['height']),
            left: Math.floor(bbox['left']),
            top: Math.floor(bbox['top']),
            childCount: elements[i].childElementCount
        });
    }

    // inject the current one set in the css_filter, which may be a CSS rule
    // used for displaying the current one in VisualSelector, where its not one we generated.
    if (css_filter.length) {
        var q = null;
        try {
            // is it xpath?
            if (css_filter.startsWith('/')) {
                q = document.evaluate(css_filter, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
            } else {
                q = document.querySelector(css_filter);
            }
        } catch (e) {
            // An invalid selector or XPath must not abort the whole element map
            console.log(e);
        }

        // The filter may match nothing, or (for XPath) a node that is not an element
        if (q && typeof q.getBoundingClientRect === 'function') {
            bbox = q.getBoundingClientRect();
            if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
                size_pos.push({
                    xpath: css_filter,
                    width: bbox['width'],
                    height: bbox['height'],
                    left: bbox['left'],
                    top: bbox['top'],
                    childCount: q.childElementCount
                });
            }
        }
    }
    // https://stackoverflow.com/questions/1145850/how-to-get-height-of-entire-document-with-javascript
    return {'size_pos':size_pos, 'browser_width': window.innerWidth, 'browser_height':document.body.scrollHeight};
"""
|
||||
xpath_data = None
|
||||
|
||||
# Will be needed in the future by the VisualSelector, always get this where possible.
|
||||
screenshot = False
|
||||
fetcher_description = "No description"
|
||||
@@ -47,7 +158,8 @@ class Fetcher():
|
||||
request_headers,
|
||||
request_body,
|
||||
request_method,
|
||||
ignore_status_codes=False):
|
||||
ignore_status_codes=False,
|
||||
current_css_filter=None):
|
||||
# Should set self.error, self.status_code and self.content
|
||||
pass
|
||||
|
||||
@@ -128,7 +240,8 @@ class base_html_playwright(Fetcher):
|
||||
request_headers,
|
||||
request_body,
|
||||
request_method,
|
||||
ignore_status_codes=False):
|
||||
ignore_status_codes=False,
|
||||
current_css_filter=None):
|
||||
|
||||
from playwright.sync_api import sync_playwright
|
||||
import playwright._impl._api_types
|
||||
@@ -148,8 +261,8 @@ class base_html_playwright(Fetcher):
|
||||
proxy=self.proxy
|
||||
)
|
||||
page = context.new_page()
|
||||
page.set_viewport_size({"width": 1280, "height": 1024})
|
||||
try:
|
||||
# Bug - never set viewport size BEFORE page.goto
|
||||
response = page.goto(url, timeout=timeout * 1000, wait_until='commit')
|
||||
# Wait_until = commit
|
||||
# - `'commit'` - consider operation to be finished when network response is received and the document started loading.
|
||||
@@ -166,14 +279,27 @@ class base_html_playwright(Fetcher):
|
||||
if len(page.content().strip()) == 0:
|
||||
raise EmptyReply(url=url, status_code=None)
|
||||
|
||||
# Bug 2(?) Set the viewport size AFTER loading the page
|
||||
page.set_viewport_size({"width": 1280, "height": 1024})
|
||||
# Bugish - Let the page redraw/reflow
|
||||
page.set_viewport_size({"width": 1280, "height": 1024})
|
||||
|
||||
self.status_code = response.status
|
||||
self.content = page.content()
|
||||
self.headers = response.all_headers()
|
||||
|
||||
if current_css_filter is not None:
|
||||
page.evaluate("var css_filter='{}'".format(current_css_filter))
|
||||
else:
|
||||
page.evaluate("var css_filter=''")
|
||||
|
||||
self.xpath_data = page.evaluate("async () => {" + self.xpath_element_js + "}")
|
||||
# Bug 3 in Playwright screenshot handling
|
||||
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
|
||||
# JPEG is better here because the screenshots can be very very large
|
||||
page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024})
|
||||
self.screenshot = page.screenshot(type='jpeg', full_page=True, quality=90)
|
||||
self.screenshot = page.screenshot(type='jpeg', full_page=True, quality=92)
|
||||
|
||||
context.close()
|
||||
browser.close()
|
||||
|
||||
@@ -225,7 +351,8 @@ class base_html_webdriver(Fetcher):
|
||||
request_headers,
|
||||
request_body,
|
||||
request_method,
|
||||
ignore_status_codes=False):
|
||||
ignore_status_codes=False,
|
||||
current_css_filter=None):
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
||||
@@ -245,6 +372,10 @@ class base_html_webdriver(Fetcher):
|
||||
self.quit()
|
||||
raise
|
||||
|
||||
self.driver.set_window_size(1280, 1024)
|
||||
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
|
||||
self.screenshot = self.driver.get_screenshot_as_png()
|
||||
|
||||
# @todo - how to check this? is it possible?
|
||||
self.status_code = 200
|
||||
# @todo somehow we should try to get this working for WebDriver
|
||||
@@ -254,8 +385,6 @@ class base_html_webdriver(Fetcher):
|
||||
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
|
||||
self.content = self.driver.page_source
|
||||
self.headers = {}
|
||||
self.screenshot = self.driver.get_screenshot_as_png()
|
||||
self.quit()
|
||||
|
||||
# Does the connection to the webdriver work? run a test connection.
|
||||
def is_ready(self):
|
||||
@@ -292,7 +421,8 @@ class html_requests(Fetcher):
|
||||
request_headers,
|
||||
request_body,
|
||||
request_method,
|
||||
ignore_status_codes=False):
|
||||
ignore_status_codes=False,
|
||||
current_css_filter=None):
|
||||
|
||||
proxies={}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user