Code clean

This commit is contained in:
Timothyxxx
2024-02-23 15:40:26 +08:00
parent 1610358e08
commit 7427b39d1d
2 changed files with 121 additions and 233 deletions

View File

@@ -284,18 +284,6 @@ def compare_contains_image(docx_file1, docx_file2):
return 1
# file1 = 'path/to/file1.docx'
# file2 = 'path/to/file2.docx'
# print(are_docx_files_same(file1, file2))
# Replace 'your_document.docx' with the path to your document
# result = contains_page_break('your_document.docx')
# print(result)
# config_path = "/home/[username]/.config/libreoffice/4/user/registrymodifications.xcu"
# print(find_default_font("Ani", config_path))
def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs):
if not compare_docx_files(file_path1, file_path2):
return 0
@@ -538,12 +526,3 @@ def compare_highlighted_text(file1, file2):
return 1
else:
return 0
if __name__ == '__main__':
print(
compare_docx_files(
r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\88fe4b2d-3040-4c70-9a70-546a47764b48\CCCH9003_Tutorial_guidelines.docx",
r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\88fe4b2d-3040-4c70-9a70-546a47764b48\CCCH9003_Tutorial_guidelines_Gold.docx",
ignore_blanks=False
)
)

View File

@@ -1,46 +1,47 @@
import builtins
import functools
import itertools
import logging
import operator
import re
import zipfile
from typing import Any, TypeVar, Union, Iterable, Optional, Callable
from typing import Dict, List, Set, Match, Tuple, Pattern
from urllib.parse import urlparse, urlunparse
import re
import functools
import operator
import builtins
import itertools
import formulas
import lxml.cssselect
import lxml.etree
import openpyxl
import xmltodict
from lxml.etree import _Element
from openpyxl import Workbook
from openpyxl.cell.cell import Cell
from openpyxl.chart._chart import ChartBase
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.worksheet.filters import AutoFilter, SortState
from openpyxl.formatting.formatting import ConditionalFormattingList
from openpyxl.pivot.cache import CacheSource as PivotCacheSource
from openpyxl.pivot.table import TableDefinition as PivotTableDefinition
from openpyxl.styles.differential import DifferentialStyle
from openpyxl.utils import coordinate_to_tuple, get_column_letter
from openpyxl.worksheet.cell_range import MultiCellRange, CellRange
from openpyxl.worksheet.dimensions import DimensionHolder
from openpyxl.formatting.formatting import ConditionalFormattingList
from openpyxl.utils import coordinate_to_tuple, get_column_letter
from openpyxl.cell.cell import Cell
from openpyxl.styles.differential import DifferentialStyle
from openpyxl.pivot.table import TableDefinition as PivotTableDefinition
from openpyxl.pivot.cache import CacheSource as PivotCacheSource
import formulas
from openpyxl.worksheet.filters import AutoFilter, SortState
from openpyxl.worksheet.worksheet import Worksheet
V = TypeVar("Value")
logger = logging.getLogger("desktopenv.metrics.utils")
_xlsx_namespaces = [ ("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main")
, ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main")
, ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
]
_xlsx_namespaces = [("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main")
, ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main")
, ("xm", "http://schemas.microsoft.com/office/excel/2006/main")
]
_xlsx_ns_mapping = dict(_xlsx_namespaces)
_xlsx_ns_imapping = dict(map(lambda itm: (itm[1], itm[0]), _xlsx_namespaces))
_xlsx_ns_imapping["http://schemas.openxmlformats.org/spreadsheetml/2006/main"] = None
_sheet_name_selector = lxml.cssselect.CSSSelector("oo|sheets>oo|sheet", namespaces=_xlsx_ns_mapping)
_sparklines_selector = lxml.cssselect.CSSSelector("x14|sparkline", namespaces=_xlsx_ns_mapping)
def load_sparklines(xlsx_file: str, sheet_name: str) -> Dict[str, str]:
# function load_sparklines {{{ #
"""
@@ -174,6 +175,7 @@ def load_charts(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, An
return chart_set
# }}} function load_charts #
# Available Pivot Properties:
# name: str
# show_total, show_empty_row, show_empty_col, show_headers: bool
@@ -210,23 +212,26 @@ def load_pivot_tables(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[s
pivot_set: Dict[str, Any] = {}
pivot_props: Set[str] = set(options.get("pivot_props", []))
for pvt in pivots:
raw_selection: List[List[tuple[Optional[bool], int]]] =\
[ [(itm.h, itm.x) for itm in f.items if itm.x is not None]\
for f in pvt.pivotFields
]
raw__selection: List[List[tuple[Optional[bool], int]]] = list(itertools.dropwhile(lambda r: len(r)==0, raw_selection))
left_bias = len(raw_selection)-len(raw__selection)
selection: List[List[tuple[Optional[bool], int]]] = list((itertools.dropwhile(lambda r: len(r)==0, reversed(raw__selection))))[::-1]
right_bias = len(raw__selection)-len(selection)
raw_selection: List[List[tuple[Optional[bool], int]]] = \
[[(itm.h, itm.x) for itm in f.items if itm.x is not None] \
for f in pvt.pivotFields
]
raw__selection: List[List[tuple[Optional[bool], int]]] = list(
itertools.dropwhile(lambda r: len(r) == 0, raw_selection))
left_bias = len(raw_selection) - len(raw__selection)
selection: List[List[tuple[Optional[bool], int]]] = list(
(itertools.dropwhile(lambda r: len(r) == 0, reversed(raw__selection))))[::-1]
right_bias = len(raw__selection) - len(selection)
cache_source: PivotCacheSource = pvt.cache.cacheSource
cell_range1: str
cell_range2: str
cell_range1, cell_range2 = cache_source.worksheetSource.ref.split(":")
cell_range1: Tuple[int, int] = coordinate_to_tuple(cell_range1)
cell_range1 = (cell_range1[0], cell_range1[1]+left_bias)
cell_range1 = (cell_range1[0], cell_range1[1] + left_bias)
cell_range2: Tuple[int, int] = coordinate_to_tuple(cell_range2)
cell_range2 = (cell_range2[0], cell_range2[1]-right_bias)
source: str = "{:};{:}:{:};{:}".format(cache_source.type, cell_range1, cell_range2, cache_source.worksheetSource.sheet)
cell_range2 = (cell_range2[0], cell_range2[1] - right_bias)
source: str = "{:};{:}:{:};{:}".format(cache_source.type, cell_range1, cell_range2,
cache_source.worksheetSource.sheet)
info: Dict[str, Any] = {}
if "name" in pivot_props:
@@ -248,22 +253,26 @@ def load_pivot_tables(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[s
if "filter" in pivot_props:
info["filter_fields"] = set(f.fld for f in pvt.pageFields)
if "col_fields" in pivot_props:
info["col_fields"] = [f.x-left_bias for f in pvt.colFields]
info["col_fields"] = [f.x - left_bias for f in pvt.colFields]
if "row_fields" in pivot_props:
info["row_fields"] = [f.x-left_bias for f in pvt.rowFields]
info["row_fields"] = [f.x - left_bias for f in pvt.rowFields]
if "data_fields" in pivot_props:
info["data_fields"] = [ "{:d};{:};{:};{:}".format( f.fld-left_bias, f.name if "data_fields_name" in pivot_props else ""
, f.subtotal, f.showDataAs
)\
for f in pvt.dataFields
]
info["data_fields"] = [
"{:d};{:};{:};{:}".format(f.fld - left_bias, f.name if "data_fields_name" in pivot_props else ""
, f.subtotal, f.showDataAs
) \
for f in pvt.dataFields
]
pivot_set[source] = info
logger.debug(".[%s].pivots: %s", sheet_name, repr(pivot_set))
return pivot_set
# }}} function load_pivot_tables #
_shared_str_selector = lxml.cssselect.CSSSelector("oo|sst>oo|si>oo|t", namespaces=_xlsx_ns_mapping)
def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
# read_cell_value {{{ #
try:
@@ -283,20 +292,20 @@ def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
with z_f.open("xl/worksheets/sheet{:}.xml".format(sheet_names[sheet_name])) as f:
sheet: _Element = lxml.etree.fromstring(f.read())
cells: List[_Element] =\
lxml.cssselect.CSSSelector( 'oo|row>oo|c[r="{:}"]'.format(coordinate)
, namespaces=_xlsx_ns_mapping
)(sheet)
if len(cells)==0:
cells: List[_Element] = \
lxml.cssselect.CSSSelector('oo|row>oo|c[r="{:}"]'.format(coordinate)
, namespaces=_xlsx_ns_mapping
)(sheet)
if len(cells) == 0:
return None
cell: _Element = cells[0]
except zipfile.BadZipFile:
return None
cell: Dict[str, str] = xmltodict.parse( lxml.etree.tostring(cell, encoding="unicode")
, process_namespaces=True
, namespaces=_xlsx_ns_imapping
)
cell: Dict[str, str] = xmltodict.parse(lxml.etree.tostring(cell, encoding="unicode")
, process_namespaces=True
, namespaces=_xlsx_ns_imapping
)
logger.debug("%s.%s[%s]: %s", xlsx_file, sheet_name, coordinate, repr(cell))
if "@t" not in cell["c"]:
return None
@@ -308,6 +317,7 @@ def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
return cell["c"]["v"]
# }}} read_cell_value #
# Supported Styles:
# number_format
# font_name - str
@@ -322,50 +332,53 @@ def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any:
# fgcolor - in aRGB, e.g., FF00FFFF is yellow
# hyperlink - str
def _read_cell_style(style_name: str, cell: Cell, diff_style: Optional[DifferentialStyle] = None) -> Any:
if style_name=="number_format":
return (cell.number_format if diff_style is None else diff_style.numFmt.formatCode)\
if cell.value is not None and cell.data_type=="n" else None
elif style_name=="font_name":
if style_name == "number_format":
return (cell.number_format if diff_style is None else diff_style.numFmt.formatCode) \
if cell.value is not None and cell.data_type == "n" else None
elif style_name == "font_name":
return (diff_style or cell).font.name if cell.value is not None else None
elif style_name=="font_family":
elif style_name == "font_family":
return (diff_style or cell).font.family if cell.value is not None else None
elif style_name=="font_color":
elif style_name == "font_color":
return (diff_style or cell).font.color.rgb if cell.value is not None else None
elif style_name=="font_bold":
elif style_name == "font_bold":
return (diff_style or cell).font.bold if cell.value is not None else None
elif style_name=="font_italic":
elif style_name == "font_italic":
return (diff_style or cell).font.italic if cell.value is not None else None
elif style_name=="font_underline":
elif style_name == "font_underline":
return (diff_style or cell).font.underline if cell.value is not None else None
elif style_name=="font_size":
elif style_name == "font_size":
return (diff_style or cell).font.size if cell.value is not None else None
elif style_name=="fill_type":
elif style_name == "fill_type":
try:
return (diff_style or cell).fill.tagname
except:
return None
elif style_name=="bgcolor":
elif style_name == "bgcolor":
try:
return (diff_style or cell).fill.bgColor.rgb
except:
return None
elif style_name=="fgcolor":
elif style_name == "fgcolor":
try:
return (diff_style or cell).fill.fgColor.rgb
except:
return None
elif style_name=="hyperlink":
elif style_name == "hyperlink":
return cell.hyperlink or "" if cell.value is not None else None
else:
raise NotImplementedError("Unsupported Style: {:}".format(style_name))
_absolute_range_pattern: Pattern[str] = re.compile( r"""\$(?P<col1>[A-Z]{1,3})\$(?P<row1>\d+) # coord1
_absolute_range_pattern: Pattern[str] = re.compile(r"""\$(?P<col1>[A-Z]{1,3})\$(?P<row1>\d+) # coord1
(?::
\$(?P<col2>[A-Z]{1,3})\$(?P<row2>\d+) # coord2
)?
"""
, re.X
)
, re.X
)
def load_xlsx_styles(xlsx_file: Workbook, sheet_name: str, book_name: str, **options) -> Dict[str, List[Any]]:
# function load_xlsx_styles {{{ #
"""
@@ -417,24 +430,24 @@ def load_xlsx_styles(xlsx_file: Workbook, sheet_name: str, book_name: str, **opt
if m[2] is None and m[3] is None:
arguments.append(read_cell_value(book_name, sheet_name, coordinate="{:}{:}".format(m[0], m[1])))
else:
arguments.append( [ read_cell_value( book_name, sheet_name
, coordinate="{:}{:}".format( get_column_letter(c[1])
arguments.append([read_cell_value(book_name, sheet_name
, coordinate="{:}{:}".format(get_column_letter(c[1])
, c[0]
)
)\
for c in CellRange("{:}{:}:{:}{:}".format(m[0], m[1], m[2], m[3])).cells\
) \
for c in CellRange("{:}{:}:{:}{:}".format(m[0], m[1], m[2], m[3])).cells \
]
)
)
logger.debug("Absolute range arguments: %s", repr(arguments))
for rge in fmt.cells:
for c in rge.cells:
cell: Cell = worksheet.cell(row=c[0], column=c[1])
cell_value = read_cell_value( book_name, sheet_name
, coordinate="{:}{:d}".format( get_column_letter(c[1])
, c[0]
)
)
cell_value = read_cell_value(book_name, sheet_name
, coordinate="{:}{:d}".format(get_column_letter(c[1])
, c[0]
)
)
if condition(cell_value, *arguments):
logger.debug("Active Cell %s(%s) for %s", repr(cell), str(cell_value), r.formula[0])
active_cells.append(cell)
@@ -448,6 +461,7 @@ def load_xlsx_styles(xlsx_file: Workbook, sheet_name: str, book_name: str, **opt
return style_dict
# }}} function load_xlsx_styles #
# Available Row Properties:
# hidden
# collapsed
@@ -460,7 +474,7 @@ def load_xlsx_styles(xlsx_file: Workbook, sheet_name: str, book_name: str, **opt
# collapsed
# min
# max
def load_rows_or_cols(xlsx_file: Workbook, sheet_name: str, **options)\
def load_rows_or_cols(xlsx_file: Workbook, sheet_name: str, **options) \
-> Dict[Union[int, str], Dict[str, Any]]:
# function load_rows_or_cols {{{ #
"""
@@ -491,6 +505,7 @@ def load_rows_or_cols(xlsx_file: Workbook, sheet_name: str, **options)\
return obj_set
# }}} function load_rows_or_cols #
def load_filters(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, Any]:
# function load_filters {{{ #
try:
@@ -514,16 +529,16 @@ def load_filters(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, A
filter_column["filters"] = set(flt_clm.filters.filter)
if flt_clm.customFilters is not None:
filter_column["custom_filters_op"] = flt_clm.customFilters._and
filter_column["custom_filters"] = set( ( flt.operator
filter_column["custom_filters"] = set((flt.operator
, flt.val
)\
for flt in flt_clm.customFilters.customFilter
)
) \
for flt in flt_clm.customFilters.customFilter
)
filter_column_set.append(filter_column)
filter_column_set = list( sorted( filter_column_set
filter_column_set = list(sorted(filter_column_set
, key=(lambda d: d["col_id"])
)
)
)
filter_dict["filter_column"] = filter_column_set
# sortState
@@ -534,26 +549,30 @@ def load_filters(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, A
sort_state_dict["case"] = sort_state.caseSensitive
sort_state_dict["method"] = sort_state.sortMethod
sort_state_dict["ref"] = sort_state.ref
sort_state_dict["condition"] = list( { "descending": cdt.descending
, "key": cdt.sortBy
, "ref": cdt.ref
, "custom_list": cdt.customList
, "dxf_id": cdt.dxfId
, "icon": cdt.iconSet
, "iconid": cdt.iconId
}\
for cdt in sort_state.sortCondition
)
sort_state_dict["condition"] = list({"descending": cdt.descending
, "key": cdt.sortBy
, "ref": cdt.ref
, "custom_list": cdt.customList
, "dxf_id": cdt.dxfId
, "icon": cdt.iconSet
, "iconid": cdt.iconId
} \
for cdt in sort_state.sortCondition
)
filter_dict["sort_state"] = sort_state_dict
return filter_dict
# }}} function load_filters #
def _match_record(pattern: Dict[str, Any], item: Dict[str, Any]) -> bool:
return all(k in item and item[k] == val for k, val in pattern.items())
def _multicellrange_containsby(subset_candidate: MultiCellRange, superset_candidate: MultiCellRange) -> bool:
return all(r in superset_candidate for r in subset_candidate)
def _match_value_to_rule(value: V, rule: Dict[str, Union[str, V]]) -> bool:
"""
Args:
@@ -576,10 +595,10 @@ def _match_value_to_rule(value: V, rule: Dict[str, Union[str, V]]) -> bool:
match_: Optional[Match[str]] = re.search(rule["ref"], value, flag)
return match_ is not None
if rule["method"] in { "eq", "ne"
, "le", "lt"
, "ge", "gt"
}:
if rule["method"] in {"eq", "ne"
, "le", "lt"
, "ge", "gt"
}:
return getattr(operator, rule["method"])(value, rule["ref"])
if rule["method"].startswith("approx"):
threshold: float = float(rule["method"].split(":")[1])
@@ -589,26 +608,27 @@ def _match_value_to_rule(value: V, rule: Dict[str, Union[str, V]]) -> bool:
except (ValueError, TypeError):
return False
else:
return abs(value-rule["ref"])<=threshold
return abs(value - rule["ref"]) <= threshold
if rule["method"] == "spreadsheet_range":
subset_limit = MultiCellRange(rule["ref"][0])
superset_limit = MultiCellRange(rule["ref"][1])
return _multicellrange_containsby(subset_limit, value)\
and _multicellrange_containsby(value, superset_limit)
if rule["method"].startswith("range."): # e.g., range.te [0, 2] -> 0 < x <= 2
return _multicellrange_containsby(subset_limit, value) \
and _multicellrange_containsby(value, superset_limit)
if rule["method"].startswith("range."): # e.g., range.te [0, 2] -> 0 < x <= 2
left_et = rule["method"][6]
right_et = rule["method"][7]
return getattr(operator, "l" + left_et)(rule["ref"][0], value)\
and getattr(operator, "l" + right_et)(value, rule["ref"][1])
return getattr(operator, "l" + left_et)(rule["ref"][0], value) \
and getattr(operator, "l" + right_et)(value, rule["ref"][1])
if rule["method"] in {"str_list_eq", "str_set_eq"}:
container_type_str: str = rule["method"][4:-3]
container_type = getattr(builtins, container_type_str)
value: container_type = container_type(value.strip("\"'").split(","))
ref: container_type = container_type(rule["ref"])
return value==ref
return value == ref
raise NotImplementedError()
def are_lists_equal(list1, list2, comparison_func):
# First check if both lists have the same length
if len(list1) != len(list2):
@@ -652,114 +672,3 @@ def compare_urls(url1, url2):
# Compare the normalized URLs
return norm_url1 == norm_url2
if __name__ == "__main__":
path1 = "test.xlsx"
#path1 = "../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold.xlsx"
path1 = "../../任务集/SheetCopilot/dataset/task_sheet_answers_v2/BoomerangSales/2_BoomerangSales/2_BoomerangSales_gt1.xlsx"
workbook1: Workbook = openpyxl.load_workbook(filename=path1)
worksheet1: Worksheet = workbook1.active
#charts: List[ChartBase] = worksheet1._charts
# print(len(charts))
# print(type(charts[0]))
#
# print(len(charts[0].series))
# print(type(charts[0].series[0]))
# print(type(charts[0].series[0].val))
##print(charts[0].series[0].val)
# print(charts[0].series[0].val.numRef.f)
#
# print(type(charts[0].series[0].cat))
##print(charts[0].series[0].cat)
# print(charts[0].series[0].cat.numRef)
# print(charts[0].series[0].cat.strRef)
# print(charts[0].series[0].cat.strRef.f)
# print(type(charts[0].title.tx.strRef))
# print(type(charts[0].title.tx.rich))
# print(type(charts[0].title.txPr))
# print(len(charts[0].title.tx.rich.p))
# print(len(charts[0].title.tx.rich.p[0].r))
# print(type(charts[0].title.tx.rich.p[0].r[0]))
# print(type(charts[0].title.tx.rich.p[0].r[0].t))
# print(charts[0].title.tx.rich.p[0].r[0].t)
# print(type(charts[0].anchor))
# print(charts[0].anchor.editAs)
# print(charts[0].anchor._from.col, charts[0].anchor.to.row)
# print(charts[0].anchor.to.col, charts[0].anchor.to.row)
# df1 = pd.read_excel(path1)
# print(df1)
#print(load_charts(path1, chart_props=["title", "xtitle", "ytitle", "type"]))
#print(type(worksheet1["A1"].hyperlink))
#print(worksheet1["A1"].hyperlink)
#print(worksheet1._charts[0].legend)
#print(worksheet1._charts[0].legend.position)
#for entr in worksheet1._charts[0].legend.legendEntry:
#print("Entr", entr.txPr.p[0].r[0].t)
#print(load_filters(workbook1, "工作表1"))
#print(worksheet1.auto_filter)
#for pvt in worksheet1._pivots:
##print(type(pvt))
##print(pvt)
#print(type(pvt.cache))
##print(pvt.cache)
#print(pvt.cache.cacheSource.type)
#print(pvt.cache.cacheSource.worksheetSource.ref)
#print(pvt.cache.cacheSource.worksheetSource.sheet)
#
#print(type(pvt.location))
#print(pvt.location)
#for f in pvt.pivotFields:
#print(type(f))
#print([(itm.h, itm.x) for itm in f.items])
##for f_itm in f.items:
##print(f_itm.n)
##print(f_itm.t)
##print(f_itm.h)
##print(f_itm.s)
##print(f_itm.sd)
##print(f_itm.f)
##print(f_itm.m)
##print(f_itm.c)
##print(f_itm.x)
##print(f_itm.d)
##print(f_itm.e)
##print(f.countASubtotal)
##print(f.countSubtotal)
##for f in pvt.dataFields:
##print(f.name)
##print(f.fld)
###print(f.baseField)
##print(f.subtotal)
##print(f.showDataAs)
##for f in pvt.rowFields:
##print(1, f.x)
##for f in pvt.rowItems:
##print(2, f.t, f.r, f.i, f.x)
##for f in pvt.colFields:
##print(3, f.x)
##for f in pvt.colItems:
##print(4, f.t, f.r, f.i, f.x)
#for f in pvt.pageFields:
#print(5, f.fld)
#for flt in pvt.filters:
#print(5, flt.fld)
#print(6, flt.mpFld)
#print(7, flt.type)
#print(8, flt.evalOrder)
#print(9, flt.id)
#print(10, flt.stringValue1)
#print(11, flt.stringValue2)
#print(load_charts(workbook1, "Sheet2", chart_props=["title", "type", "legend"]))
#print(load_filters(workbook1, "透视表_工作表1_1"))
#workbook1.save("test2.xlsx")
print( load_pivot_tables( workbook1, "Sheet2", pivot_props=[ "col_fields"
, "filter"
, "row_fields"
, "data_fields"
]
)
)