fix: improve EPUB processing by checking for file existence before reading

- Added checks that "toc.ncx" and "content.opf" appear in the archive's member list (`ZipFile.namelist()`) before attempting to open and process them.
- Introduced debug logging that reports when either file is not found, enhancing error handling and traceability.
- Maintained existing logic while improving robustness of the EPUB processing function.
This commit is contained in:
yuanmengqi
2025-07-26 20:42:18 +00:00
parent b25854edba
commit 122b16742b

View File

@@ -23,22 +23,34 @@ def process_epub(filename: str) -> List[str]:
try: try:
with zipfile.ZipFile(filename, "r") as z_f: with zipfile.ZipFile(filename, "r") as z_f:
with z_f.open("toc.ncx") as in_f \ # Get list of all files in the zip archive
, open(os.path.join(base_dir, "toc.ncx"), "w") as out_f: zip_file_list = z_f.namelist()
contents: str = in_f.read().decode()
contents = contents.splitlines() # Process toc.ncx if it exists
for l in contents: if "toc.ncx" in zip_file_list:
if "navPoint" not in l: with z_f.open("toc.ncx") as in_f \
out_f.write(l + "\n") , open(os.path.join(base_dir, "toc.ncx"), "w") as out_f:
file_list.append(os.path.join(base_dir, "toc.ncx")) contents: str = in_f.read().decode()
with z_f.open("content.opf") as in_f \ contents = contents.splitlines()
, open(os.path.join(base_dir, "content.opf"), "w") as out_f: for l in contents:
contents: str = in_f.read().decode() if "navPoint" not in l:
contents = contents.splitlines() out_f.write(l + "\n")
for l in contents: file_list.append(os.path.join(base_dir, "toc.ncx"))
if "dc:identifier" not in l: else:
out_f.write(l + "\n") logger.debug("toc.ncx not found in epub file: %s", filename)
file_list.append(os.path.join(base_dir, "content.opf"))
# Process content.opf if it exists
if "content.opf" in zip_file_list:
with z_f.open("content.opf") as in_f \
, open(os.path.join(base_dir, "content.opf"), "w") as out_f:
contents: str = in_f.read().decode()
contents = contents.splitlines()
for l in contents:
if "dc:identifier" not in l:
out_f.write(l + "\n")
file_list.append(os.path.join(base_dir, "content.opf"))
else:
logger.debug("content.opf not found in epub file: %s", filename)
for f_n in z_f.namelist(): for f_n in z_f.namelist():
if f_n.endswith(".html"): if f_n.endswith(".html"):
with z_f.open(f_n) as in_f \ with z_f.open(f_n) as in_f \