diff --git a/desktop_env/evaluators/metrics/utils.py b/desktop_env/evaluators/metrics/utils.py index c3280eb..694b147 100644 --- a/desktop_env/evaluators/metrics/utils.py +++ b/desktop_env/evaluators/metrics/utils.py @@ -10,7 +10,7 @@ import zipfile #import pandas as pd from typing import Any, TypeVar, Union, Iterable, Optional, Callable from typing import Dict, List, Set, Match, Tuple, Pattern -from urllib.parse import urlparse, urlunparse +from urllib.parse import urlparse, urlunparse, ParseResult import formulas import lxml.cssselect @@ -29,6 +29,7 @@ from openpyxl.worksheet.cell_range import MultiCellRange, CellRange from openpyxl.worksheet.dimensions import DimensionHolder from openpyxl.worksheet.filters import AutoFilter, SortState from openpyxl.worksheet.worksheet import Worksheet +import tldextract V = TypeVar("Value") @@ -699,29 +700,59 @@ def are_lists_equal(list1, list2, comparison_func): return True -def compare_urls(url1, url2): +def compare_urls(url1, url2, full=True): if url1 is None or url2 is None: return url1 == url2 + + logger.info(f"compare_urls. url1: {url1}; url2: {url2}") + + def parse_with_default_scheme(url): + """ + Ensure the URL has a scheme. If not, prepend 'http://' + so it parses as host + path instead of just a path. + """ + # Regex to check if URL has scheme like 'http://', 'https://', etc. + if not re.match(r'^[a-zA-Z][a-zA-Z0-9+\-.]*://', url): + url = f"http://{url}" + return urlparse(url) def normalize_url(url): - # Parse the URL - parsed_url = urlparse(url) + # Parse the URL; if no scheme is present, assume 'http' + parsed_url = parse_with_default_scheme(url) + scheme = parsed_url.scheme.lower() - # If no scheme is present, assume 'http' - scheme = parsed_url.scheme if parsed_url.scheme else 'http' + # Extract the domain parts using tldextract + extracted = tldextract.extract(parsed_url.netloc.lower()) + # e.g., extracted = TLDExtractResult(subdomain='www', domain='airbnb', suffix='com.sg') + + # Drop 'www' if it's the only subdomain + subdomain = extracted.subdomain + if subdomain == 'www': + subdomain = '' - # Lowercase the scheme and netloc, remove 'www.', and handle trailing slash - normalized_netloc = parsed_url.netloc.lower().replace("www.", "") + # Instead of using the suffix (e.g., 'com', 'com.sg'), ignore it completely + # so that both 'airbnb.com' and 'airbnb.com.sg' become just 'airbnb' or 'www.airbnb' + if subdomain: + normalized_netloc = f"{subdomain}.{extracted.domain}" + else: + normalized_netloc = extracted.domain + + # Handle trailing slash in the path normalized_path = parsed_url.path if parsed_url.path != '/' else '' - # Reassemble the URL with normalized components - normalized_parsed_url = parsed_url._replace(scheme=scheme.lower(), netloc=normalized_netloc, - path=normalized_path) - normalized_url = urlunparse(normalized_parsed_url) + # Reassemble the URL with the normalized components + normalized_parsed_url = ParseResult( + scheme=scheme.lower(), + netloc=normalized_netloc, + path=normalized_path, + params=parsed_url.params if full else '', # Keep the params + query=parsed_url.query if full else '', # Keep the query string + fragment=parsed_url.fragment if full else '', # Keep the fragment + ) + return urlunparse(normalized_parsed_url) - return normalized_url - - # Normalize both URLs for comparison + logger.info(f"After normalization. url1: {normalize_url(url1)}; url2: {normalize_url(url2)}") + # Normalize both URLs norm_url1 = normalize_url(url1) norm_url2 = normalize_url(url2) diff --git a/desktop_env/providers/aws/manager.py b/desktop_env/providers/aws/manager.py index 703359b..687b745 100644 --- a/desktop_env/providers/aws/manager.py +++ b/desktop_env/providers/aws/manager.py @@ -37,7 +37,7 @@ DEFAULT_REGION = "us-east-1" # todo: public the AMI images IMAGE_ID_MAP = { "us-east-1": { - (1920, 1080): "ami-09138bff939f82bd8" + (1920, 1080): "ami-0d23263edb96951d8" }, "ap-east-1": { (1920, 1080): "ami-0c092a5b8be4116f5" diff --git a/requirements.txt b/requirements.txt index 88a22e4..924fffe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -62,3 +62,4 @@ azure-mgmt-network docker loguru dotenv +tldextract