fix: improve EPUB processing by checking for file existence before reading
- Added checks for the presence of "toc.ncx" and "content.opf" in the EPUB archive before attempting to process them.
- Introduced debug logging to report when either file is not found, improving error handling and traceability.
- Kept the existing processing logic unchanged while making the EPUB processing function more robust.
This commit is contained in:
@@ -23,22 +23,34 @@ def process_epub(filename: str) -> List[str]:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
with zipfile.ZipFile(filename, "r") as z_f:
|
with zipfile.ZipFile(filename, "r") as z_f:
|
||||||
with z_f.open("toc.ncx") as in_f \
|
# Get list of all files in the zip archive
|
||||||
, open(os.path.join(base_dir, "toc.ncx"), "w") as out_f:
|
zip_file_list = z_f.namelist()
|
||||||
contents: str = in_f.read().decode()
|
|
||||||
contents = contents.splitlines()
|
# Process toc.ncx if it exists
|
||||||
for l in contents:
|
if "toc.ncx" in zip_file_list:
|
||||||
if "navPoint" not in l:
|
with z_f.open("toc.ncx") as in_f \
|
||||||
out_f.write(l + "\n")
|
, open(os.path.join(base_dir, "toc.ncx"), "w") as out_f:
|
||||||
file_list.append(os.path.join(base_dir, "toc.ncx"))
|
contents: str = in_f.read().decode()
|
||||||
with z_f.open("content.opf") as in_f \
|
contents = contents.splitlines()
|
||||||
, open(os.path.join(base_dir, "content.opf"), "w") as out_f:
|
for l in contents:
|
||||||
contents: str = in_f.read().decode()
|
if "navPoint" not in l:
|
||||||
contents = contents.splitlines()
|
out_f.write(l + "\n")
|
||||||
for l in contents:
|
file_list.append(os.path.join(base_dir, "toc.ncx"))
|
||||||
if "dc:identifier" not in l:
|
else:
|
||||||
out_f.write(l + "\n")
|
logger.debug("toc.ncx not found in epub file: %s", filename)
|
||||||
file_list.append(os.path.join(base_dir, "content.opf"))
|
|
||||||
|
# Process content.opf if it exists
|
||||||
|
if "content.opf" in zip_file_list:
|
||||||
|
with z_f.open("content.opf") as in_f \
|
||||||
|
, open(os.path.join(base_dir, "content.opf"), "w") as out_f:
|
||||||
|
contents: str = in_f.read().decode()
|
||||||
|
contents = contents.splitlines()
|
||||||
|
for l in contents:
|
||||||
|
if "dc:identifier" not in l:
|
||||||
|
out_f.write(l + "\n")
|
||||||
|
file_list.append(os.path.join(base_dir, "content.opf"))
|
||||||
|
else:
|
||||||
|
logger.debug("content.opf not found in epub file: %s", filename)
|
||||||
for f_n in z_f.namelist():
|
for f_n in z_f.namelist():
|
||||||
if f_n.endswith(".html"):
|
if f_n.endswith(".html"):
|
||||||
with z_f.open(f_n) as in_f \
|
with z_f.open(f_n) as in_f \
|
||||||
|
|||||||
Reference in New Issue
Block a user