feat: enhance URL comparison logic and Chrome debugging configuration

- Added a `parse_with_default_scheme` helper that prepends 'http://' to scheme-less URLs so they parse as host + path rather than as a bare path.
- Integrated tldextract to normalize URLs: a lone 'www' subdomain is dropped and the public suffix is ignored, so 'airbnb.com' and 'airbnb.com.sg' compare as equal (see the sketch after the compare_urls hunk below).
- Updated compare_urls with a new `full` flag (set False to ignore params, query, and fragment) and added logging for better traceability during URL comparisons.
- Added tldextract to requirements.txt to support the new functionality.
- Updated the AWS manager with a new AMI ID for the 1920x1080 image in us-east-1.
- Modified the Chrome desktop launcher to pass --remote-debugging-port=1337 for GUI debugging support (a quick connectivity check follows the summary below).

These changes make URL comparison more robust and give Chrome sessions a consistent remote-debugging entry point, without touching unrelated logic.
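As a quick sanity check for the debugging flag, here is a minimal sketch (illustrative only, not part of the commit): it assumes Chrome is already running locally with the flag applied, and uses the standard DevTools HTTP endpoint /json/version, which answers on the chosen port.

import json
from urllib.request import urlopen

# Chrome's DevTools HTTP endpoint; 1337 matches the launcher flag above.
with urlopen("http://localhost:1337/json/version") as resp:
    info = json.load(resp)
print(info["Browser"])  # e.g. "Chrome/126.0.6478.127" (version string will vary)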
Author: yuanmengqi
Date: 2025-07-18 17:55:45 +00:00
Parent: 1ade6fe439
Commit: 4fa59ebba2
3 changed files with 48 additions and 16 deletions


@@ -10,7 +10,7 @@ import zipfile
 #import pandas as pd
 from typing import Any, TypeVar, Union, Iterable, Optional, Callable
 from typing import Dict, List, Set, Match, Tuple, Pattern
-from urllib.parse import urlparse, urlunparse
+from urllib.parse import urlparse, urlunparse, ParseResult
 
 import formulas
 import lxml.cssselect
@@ -29,6 +29,7 @@ from openpyxl.worksheet.cell_range import MultiCellRange, CellRange
 from openpyxl.worksheet.dimensions import DimensionHolder
 from openpyxl.worksheet.filters import AutoFilter, SortState
 from openpyxl.worksheet.worksheet import Worksheet
+import tldextract
 
 V = TypeVar("Value")
 
@@ -699,29 +700,59 @@ def are_lists_equal(list1, list2, comparison_func):
     return True
 
-def compare_urls(url1, url2):
+def compare_urls(url1, url2, full=True):
     if url1 is None or url2 is None:
         return url1 == url2
+    logger.info(f"compare_urls. url1: {url1}; url2: {url2}")
+
+    def parse_with_default_scheme(url):
+        """
+        Ensure the URL has a scheme. If not, prepend 'http://'
+        so it parses as host + path instead of just a path.
+        """
+        # Regex to check if URL has scheme like 'http://', 'https://', etc.
+        if not re.match(r'^[a-zA-Z][a-zA-Z0-9+\-.]*://', url):
+            url = f"http://{url}"
+        return urlparse(url)
 
     def normalize_url(url):
-        # Parse the URL
-        parsed_url = urlparse(url)
+        # Parse the URL; if no scheme is present, assume 'http'
+        parsed_url = parse_with_default_scheme(url)
+        scheme = parsed_url.scheme.lower()
 
-        # If no scheme is present, assume 'http'
-        scheme = parsed_url.scheme if parsed_url.scheme else 'http'
+        # Extract the domain parts using tldextract
+        extracted = tldextract.extract(parsed_url.netloc.lower())
+        # e.g., extracted = TLDExtractResult(subdomain='www', domain='airbnb', suffix='com.sg')
 
-        # Lowercase the scheme and netloc, remove 'www.', and handle trailing slash
-        normalized_netloc = parsed_url.netloc.lower().replace("www.", "")
+        # Drop 'www' if it's the only subdomain
+        subdomain = extracted.subdomain
+        if subdomain == 'www':
+            subdomain = ''
+
+        # Instead of using the suffix (e.g., 'com', 'com.sg'), ignore it completely
+        # so that both 'airbnb.com' and 'airbnb.com.sg' become just 'airbnb' or 'www.airbnb'
+        if subdomain:
+            normalized_netloc = f"{subdomain}.{extracted.domain}"
+        else:
+            normalized_netloc = extracted.domain
+
+        # Handle trailing slash in the path
         normalized_path = parsed_url.path if parsed_url.path != '/' else ''
 
-        # Reassemble the URL with normalized components
-        normalized_parsed_url = parsed_url._replace(scheme=scheme.lower(), netloc=normalized_netloc,
-                                                    path=normalized_path)
-        normalized_url = urlunparse(normalized_parsed_url)
-
-        return normalized_url
+        # Reassemble the URL with the normalized components
+        normalized_parsed_url = ParseResult(
+            scheme=scheme.lower(),
+            netloc=normalized_netloc,
+            path=normalized_path,
+            params=parsed_url.params if full else '',  # Keep the params
+            query=parsed_url.query if full else '',  # Keep the query string
+            fragment=parsed_url.fragment if full else '',  # Keep the fragment
+        )
+        return urlunparse(normalized_parsed_url)
+
+    logger.info(f"After normalization. url1: {normalize_url(url1)}; url2: {normalize_url(url2)}")
 
-    # Normalize both URLs
+    # Normalize both URLs for comparison
     norm_url1 = normalize_url(url1)
     norm_url2 = normalize_url(url2)


@@ -37,7 +37,7 @@ DEFAULT_REGION = "us-east-1"
 # todo: public the AMI images
 IMAGE_ID_MAP = {
     "us-east-1": {
-        (1920, 1080): "ami-09138bff939f82bd8"
+        (1920, 1080): "ami-0d23263edb96951d8"
     },
     "ap-east-1": {
         (1920, 1080): "ami-0c092a5b8be4116f5"


@@ -62,3 +62,4 @@ azure-mgmt-network
 docker
 loguru
 dotenv
+tldextract