feat: enhance URL comparison logic and Chrome debugging configuration
- Added parse_with_default_scheme to ensure URLs have a scheme, defaulting to 'http://' when one is missing.
- Integrated tldextract to normalize URLs by extracting domain parts, dropping a lone 'www' subdomain, and ignoring the public suffix.
- Updated compare_urls to log the input and normalized URLs for traceability, and added a `full` flag to optionally ignore params, query, and fragment.
- Added tldextract to requirements.txt to support the new functionality.
- Updated the AWS manager with a new us-east-1 AMI ID for the 1920x1080 resolution.
- Modified the Chrome desktop launcher to pass --remote-debugging-port=1337 for GUI debugging support.

These changes make URL comparison more robust and enable consistent Chrome debugging without altering the existing comparison logic.
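Note: the Chrome launcher hunk is not shown below. As a reference for what the new flag enables, here is a minimal sketch of probing the DevTools endpoint that --remote-debugging-port=1337 opens (localhost and the stdlib client are assumptions, not part of the commit):

import json
import urllib.request

# Chrome started with --remote-debugging-port=1337 serves a small HTTP API;
# /json/version describes the browser and its CDP websocket endpoint.
with urllib.request.urlopen("http://127.0.0.1:1337/json/version") as resp:
    info = json.load(resp)

print(info["Browser"])               # e.g. "Chrome/120.0.0.0"
print(info["webSocketDebuggerUrl"])  # attach a CDP client here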
@@ -10,7 +10,7 @@ import zipfile
 #import pandas as pd
 from typing import Any, TypeVar, Union, Iterable, Optional, Callable
 from typing import Dict, List, Set, Match, Tuple, Pattern
-from urllib.parse import urlparse, urlunparse
+from urllib.parse import urlparse, urlunparse, ParseResult
 
 import formulas
 import lxml.cssselect
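The newly imported ParseResult is the named tuple that urlparse returns; constructing one directly (as normalize_url now does) rebuilds a URL from explicit components instead of patching the original parse. A small stdlib-only illustration with made-up values:

from urllib.parse import ParseResult, urlunparse

# Assemble a URL from scratch; empty components are omitted from the result.
rebuilt = ParseResult(scheme="http", netloc="airbnb", path="/rooms",
                      params="", query="", fragment="")
print(urlunparse(rebuilt))  # http://airbnb/rooms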
@@ -29,6 +29,7 @@ from openpyxl.worksheet.cell_range import MultiCellRange, CellRange
 from openpyxl.worksheet.dimensions import DimensionHolder
 from openpyxl.worksheet.filters import AutoFilter, SortState
 from openpyxl.worksheet.worksheet import Worksheet
+import tldextract
 
 V = TypeVar("Value")
 
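tldextract is used here because it consults the Public Suffix List, so multi-part suffixes such as 'com.sg' are treated as one unit, which naive string splitting gets wrong. A quick sketch (first use may download the suffix list unless a cached copy is configured):

import tldextract

ext = tldextract.extract("www.airbnb.com.sg")
print(ext.subdomain)  # www
print(ext.domain)     # airbnb
print(ext.suffix)     # com.sg  (a plain rsplit('.') would stop at 'sg')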
@@ -699,29 +700,59 @@ def are_lists_equal(list1, list2, comparison_func):
     return True
 
 
-def compare_urls(url1, url2):
+def compare_urls(url1, url2, full=True):
     if url1 is None or url2 is None:
         return url1 == url2
 
+    logger.info(f"compare_urls. url1: {url1}; url2: {url2}")
+
+    def parse_with_default_scheme(url):
+        """
+        Ensure the URL has a scheme. If not, prepend 'http://'
+        so it parses as host + path instead of just a path.
+        """
+        # Regex to check if the URL has a scheme like 'http://', 'https://', etc.
+        if not re.match(r'^[a-zA-Z][a-zA-Z0-9+\-.]*://', url):
+            url = f"http://{url}"
+        return urlparse(url)
+
     def normalize_url(url):
-        # Parse the URL
-        parsed_url = urlparse(url)
+        # Parse the URL; if no scheme is present, assume 'http'
+        parsed_url = parse_with_default_scheme(url)
+        scheme = parsed_url.scheme.lower()
 
-        # If no scheme is present, assume 'http'
-        scheme = parsed_url.scheme if parsed_url.scheme else 'http'
+        # Extract the domain parts using tldextract
+        extracted = tldextract.extract(parsed_url.netloc.lower())
+        # e.g., extracted = ExtractResult(subdomain='www', domain='airbnb', suffix='com.sg')
+
+        # Drop 'www' if it's the only subdomain
+        subdomain = extracted.subdomain
+        if subdomain == 'www':
+            subdomain = ''
 
-        # Lowercase the scheme and netloc, remove 'www.', and handle trailing slash
-        normalized_netloc = parsed_url.netloc.lower().replace("www.", "")
+        # Instead of using the suffix (e.g., 'com', 'com.sg'), ignore it completely
+        # so that both 'airbnb.com' and 'airbnb.com.sg' become just 'airbnb' or 'www.airbnb'
+        if subdomain:
+            normalized_netloc = f"{subdomain}.{extracted.domain}"
+        else:
+            normalized_netloc = extracted.domain
+
+        # Handle trailing slash in the path
         normalized_path = parsed_url.path if parsed_url.path != '/' else ''
 
-        # Reassemble the URL with normalized components
-        normalized_parsed_url = parsed_url._replace(scheme=scheme.lower(), netloc=normalized_netloc,
-                                                    path=normalized_path)
-        normalized_url = urlunparse(normalized_parsed_url)
+        # Reassemble the URL with the normalized components
+        normalized_parsed_url = ParseResult(
+            scheme=scheme.lower(),
+            netloc=normalized_netloc,
+            path=normalized_path,
+            params=parsed_url.params if full else '',  # Keep the params
+            query=parsed_url.query if full else '',  # Keep the query string
+            fragment=parsed_url.fragment if full else '',  # Keep the fragment
+        )
+        return urlunparse(normalized_parsed_url)
 
-        return normalized_url
+    logger.info(f"After normalization. url1: {normalize_url(url1)}; url2: {normalize_url(url2)}")
 
-    # Normalize both URLs
+    # Normalize both URLs for comparison
     norm_url1 = normalize_url(url1)
     norm_url2 = normalize_url(url2)
@@ -37,7 +37,7 @@ DEFAULT_REGION = "us-east-1"
 # todo: public the AMI images
 IMAGE_ID_MAP = {
     "us-east-1": {
-        (1920, 1080): "ami-09138bff939f82bd8"
+        (1920, 1080): "ami-0d23263edb96951d8"
     },
     "ap-east-1": {
         (1920, 1080): "ami-0c092a5b8be4116f5"
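The map is keyed by region and then by (width, height), so consuming code presumably resolves an image roughly like this (hypothetical usage, not part of the diff):

# Hypothetical lookup mirroring the map's structure.
ami_id = IMAGE_ID_MAP["us-east-1"][(1920, 1080)]  # "ami-0d23263edb96951d8"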
@@ -62,3 +62,4 @@ azure-mgmt-network
 docker
 loguru
 dotenv
+tldextract
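One operational note on the new dependency: by default tldextract fetches the Public Suffix List over the network on first use. Where that is undesirable (offline machines, CI), it can be pinned to the snapshot bundled with the package; a minimal sketch:

import tldextract

# Empty suffix_list_urls disables the network fetch and uses the bundled list.
offline_extract = tldextract.TLDExtract(suffix_list_urls=())
print(offline_extract("www.airbnb.com.sg").domain)  # airbnb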