fix: improve EPUB processing by checking for file existence before reading

- Added checks for the presence of "toc.ncx" and "content.opf" in the EPUB file before attempting to process them.
- Introduced debug logging to notify when these files are not found, enhancing error handling and traceability.
- Maintained existing logic while improving robustness of the EPUB processing function.
2025-07-26 20:42:18 +00:00
parent b25854edba
commit 122b16742b
1 changed file with 28 additions and 16 deletions
--- a/desktop_env/evaluators/metrics/others.py
+++ b/desktop_env/evaluators/metrics/others.py
@@ -23,22 +23,34 @@ def process_epub(filename: str) -> List[str]:
    try:
        with zipfile.ZipFile(filename, "r") as z_f:
-            with z_f.open("toc.ncx") as in_f \
+            # Get list of all files in the zip archive
-                    , open(os.path.join(base_dir, "toc.ncx"), "w") as out_f:
+            zip_file_list = z_f.namelist()
-                contents: str = in_f.read().decode()
+            
-                contents = contents.splitlines()
+            # Process toc.ncx if it exists
-                for l in contents:
+            if "toc.ncx" in zip_file_list:
-                    if "navPoint" not in l:
+                with z_f.open("toc.ncx") as in_f \
-                        out_f.write(l + "\n")
+                        , open(os.path.join(base_dir, "toc.ncx"), "w") as out_f:
-            file_list.append(os.path.join(base_dir, "toc.ncx"))
+                    contents: str = in_f.read().decode()
-            with z_f.open("content.opf") as in_f \
+                    contents = contents.splitlines()
-                    , open(os.path.join(base_dir, "content.opf"), "w") as out_f:
+                    for l in contents:
-                contents: str = in_f.read().decode()
+                        if "navPoint" not in l:
-                contents = contents.splitlines()
+                            out_f.write(l + "\n")
-                for l in contents:
+                file_list.append(os.path.join(base_dir, "toc.ncx"))
-                    if "dc:identifier" not in l:
+            else:
-                        out_f.write(l + "\n")
+                logger.debug("toc.ncx not found in epub file: %s", filename)
-            file_list.append(os.path.join(base_dir, "content.opf"))
+            
            # Process content.opf if it exists
            if "content.opf" in zip_file_list:
                with z_f.open("content.opf") as in_f \
                        , open(os.path.join(base_dir, "content.opf"), "w") as out_f:
                    contents: str = in_f.read().decode()
                    contents = contents.splitlines()
                    for l in contents:
                        if "dc:identifier" not in l:
                            out_f.write(l + "\n")
                file_list.append(os.path.join(base_dir, "content.opf"))
            else:
                logger.debug("content.opf not found in epub file: %s", filename)
            for f_n in z_f.namelist():
                if f_n.endswith(".html"):
                    with z_f.open(f_n) as in_f \