feat: enhance URL comparison logic and Chrome debugging configuration

- Added a new function to ensure URLs have a scheme, defaulting to 'http://' if missing.
- Integrated tldextract to normalize URLs by extracting domain parts, dropping 'www' subdomains, and ignoring the public suffix so that country-specific domains (e.g. 'airbnb.com' vs 'airbnb.com.sg') compare as equal.
- Updated the compare_urls function to include logging for better traceability during URL comparisons.
- Added tldextract to requirements.txt to support the new functionality.
- Updated the AWS manager with a new AMI ID for the specified resolution.
- Modified Chrome desktop launcher to include --remote-debugging-port=1337 for GUI debugging support.

These changes improve the robustness of URL handling and enable consistent Chrome debugging capabilities. Note that URL comparison semantics do change: the normalizer now ignores the TLD suffix, and compare_urls gains a backward-compatible `full` flag (default True) controlling whether params, query, and fragment participate in the comparison.
This commit is contained in:
yuanmengqi
2025-07-18 17:55:45 +00:00
parent 1ade6fe439
commit 4fa59ebba2
3 changed files with 48 additions and 16 deletions

View File

@@ -10,7 +10,7 @@ import zipfile
#import pandas as pd
from typing import Any, TypeVar, Union, Iterable, Optional, Callable
from typing import Dict, List, Set, Match, Tuple, Pattern
from urllib.parse import urlparse, urlunparse
from urllib.parse import urlparse, urlunparse, ParseResult
import formulas
import lxml.cssselect
@@ -29,6 +29,7 @@ from openpyxl.worksheet.cell_range import MultiCellRange, CellRange
from openpyxl.worksheet.dimensions import DimensionHolder
from openpyxl.worksheet.filters import AutoFilter, SortState
from openpyxl.worksheet.worksheet import Worksheet
import tldextract
V = TypeVar("Value")
@@ -699,29 +700,59 @@ def are_lists_equal(list1, list2, comparison_func):
return True
def compare_urls(url1, url2):
def compare_urls(url1, url2, full=True):
if url1 is None or url2 is None:
return url1 == url2
logger.info(f"compare_urls. url1: {url1}; url2: {url2}")
def parse_with_default_scheme(url):
    """Parse *url*, prepending 'http://' when no scheme is present.

    Without a scheme, urlparse treats the entire string as a path;
    forcing a scheme makes it parse as host + path instead.
    """
    # A scheme looks like 'http://', 'ftp://', etc. (letter, then
    # letters/digits/+/-/. up to '://'), per the URI grammar.
    has_scheme = re.match(r'^[a-zA-Z][a-zA-Z0-9+\-.]*://', url)
    return urlparse(url if has_scheme else f"http://{url}")
def normalize_url(url):
# Parse the URL
parsed_url = urlparse(url)
# Parse the URL; if no scheme is present, assume 'http'
parsed_url = parse_with_default_scheme(url)
scheme = parsed_url.scheme.lower()
# If no scheme is present, assume 'http'
scheme = parsed_url.scheme if parsed_url.scheme else 'http'
# Extract the domain parts using tldextract
extracted = tldextract.extract(parsed_url.netloc.lower())
# e.g., extracted = TLDExtractResult(subdomain='www', domain='airbnb', suffix='com.sg')
# Drop 'www' if it's the only subdomain
subdomain = extracted.subdomain
if subdomain == 'www':
subdomain = ''
# Lowercase the scheme and netloc, remove 'www.', and handle trailing slash
normalized_netloc = parsed_url.netloc.lower().replace("www.", "")
# Instead of using the suffix (e.g., 'com', 'com.sg'), ignore it completely
# so that both 'airbnb.com' and 'airbnb.com.sg' become just 'airbnb' or 'www.airbnb'
if subdomain:
normalized_netloc = f"{subdomain}.{extracted.domain}"
else:
normalized_netloc = extracted.domain
# Handle trailing slash in the path
normalized_path = parsed_url.path if parsed_url.path != '/' else ''
# Reassemble the URL with normalized components
normalized_parsed_url = parsed_url._replace(scheme=scheme.lower(), netloc=normalized_netloc,
path=normalized_path)
normalized_url = urlunparse(normalized_parsed_url)
# Reassemble the URL with the normalized components
normalized_parsed_url = ParseResult(
scheme=scheme.lower(),
netloc=normalized_netloc,
path=normalized_path,
params=parsed_url.params if full else '', # Keep the params
query=parsed_url.query if full else '', # Keep the query string
fragment=parsed_url.fragment if full else '', # Keep the fragment
)
return urlunparse(normalized_parsed_url)
return normalized_url
# Normalize both URLs for comparison
logger.info(f"After normalization. url1: {normalize_url(url1)}; url2: {normalize_url(url2)}")
# Normalize both URLs
norm_url1 = normalize_url(url1)
norm_url2 = normalize_url(url2)

View File

@@ -37,7 +37,7 @@ DEFAULT_REGION = "us-east-1"
# todo: public the AMI images
IMAGE_ID_MAP = {
"us-east-1": {
(1920, 1080): "ami-09138bff939f82bd8"
(1920, 1080): "ami-0d23263edb96951d8"
},
"ap-east-1": {
(1920, 1080): "ami-0c092a5b8be4116f5"

View File

@@ -62,3 +62,4 @@ azure-mgmt-network
docker
loguru
dotenv
tldextract