grok-build/latest/content · 0.2.69
skills/xlsx/scripts/office/validators/base.py
Skill30.9 KB234 lines
"""
Base validator with common validation logic for document files.
"""
import re
from pathlib import Path
import defusedxml.minidom
import lxml.etree
class BaseSchemaValidator:
IGNORED_VALIDATION_ERRORS = [
"hyphenationZone",
"purl.org/dc/terms",
]
UNIQUE_ID_REQUIREMENTS = {
"comment": ("id", "file"),
"commentrangestart": ("id", "file"),
"commentrangeend": ("id", "file"),
"bookmarkstart": ("id", "file"),
"bookmarkend": ("id", "file"),
"sldid": ("id", "file"),
"sldmasterid": ("id", "global"),
"sldlayoutid": ("id", "global"),
"cm": ("authorid", "file"),
"sheet": ("sheetid", "file"),
"definedname": ("id", "file"),
"cxnsp": ("id", "file"),
"sp": ("id", "file"),
"pic": ("id", "file"),
"grpsp": ("id", "file"),
}
EXCLUDED_ID_CONTAINERS = {
"sectionlst",
}
ELEMENT_RELATIONSHIP_TYPES = {}
SCHEMA_MAPPINGS = {
"word": "ISO-IEC29500-4_2016/wml.xsd",
"ppt": "ISO-IEC29500-4_2016/pml.xsd",
"xl": "ISO-IEC29500-4_2016/sml.xsd",
"[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
"app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
"core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
"custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
".rels": "ecma/fouth-edition/opc-relationships.xsd",
"people.xml": "microsoft/wml-2012.xsd",
"commentsIds.xml": "microsoft/wml-cid-2016.xsd",
"commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
"commentsExtended.xml": "microsoft/wml-2012.xsd",
"chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
"theme": "ISO-IEC29500-4_2016/dml-main.xsd",
"drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
}
MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
PACKAGE_RELATIONSHIPS_NAMESPACE = "http://schemas.openxmlformats.org/package/2006/relationships"
OFFICE_RELATIONSHIPS_NAMESPACE = (
"http://schemas.openxmlformats.org/officeDocument/2006/relationships"
)
CONTENT_TYPES_NAMESPACE = "http://schemas.openxmlformats.org/package/2006/content-types"
MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}
OOXML_NAMESPACES = {
"http://schemas.openxmlformats.org/officeDocument/2006/math",
"http://schemas.openxmlformats.org/officeDocument/2006/relationships",
"http://schemas.openxmlformats.org/schemaLibrary/2006/main",
"http://schemas.openxmlformats.org/drawingml/2006/main",
"http://schemas.openxmlformats.org/drawingml/2006/chart",
"http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
"http://schemas.openxmlformats.org/drawingml/2006/diagram",
"http://schemas.openxmlformats.org/drawingml/2006/picture",
"http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
"http://schemas.openxmlformats.org/wordprocessingml/2006/main",
"http://schemas.openxmlformats.org/presentationml/2006/main",
"http://schemas.openxmlformats.org/spreadsheetml/2006/main",
"http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
"http://www.w3.org/XML/1998/namespace",
}
def __init__(self, unpacked_dir, original_file=None, verbose=False):
self.unpacked_dir = Path(unpacked_dir).resolve()
self.original_file = Path(original_file) if original_file else None
self.verbose = verbose
self.schemas_dir = Path(__file__).parent.parent / "schemas"
patterns = ["*.xml", "*.rels"]
self.xml_files = [f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)]
if not self.xml_files:
print(f"Warning: No XML files found in {self.unpacked_dir}")
def validate(self):
raise NotImplementedError("Subclasses must implement the validate method")
def repair(self) -> int:
return self.repair_whitespace_preservation()
def repair_whitespace_preservation(self) -> int:
repairs = 0
for xml_file in self.xml_files:
try:
content = xml_file.read_text(encoding="utf-8")
dom = defusedxml.minidom.parseString(content)
modified = False
for elem in dom.getElementsByTagName("*"):
if elem.tagName.endswith(":t") and elem.firstChild:
text = elem.firstChild.nodeValue
if text and (text.startswith((" ", "\t")) or text.endswith((" ", "\t"))):
if elem.getAttribute("xml:space") != "preserve":
elem.setAttribute("xml:space", "preserve")
text_preview = (
repr(text[:30]) + "..." if len(text) > 30 else repr(text)
)
print(
f" Repaired: {xml_file.name}: Added xml:space='preserve' to {elem.tagName}: {text_preview}"
)
repairs += 1
modified = True
if modified:
xml_file.write_bytes(dom.toxml(encoding="UTF-8"))
except Exception:
pass
return repairs
def validate_xml(self):
errors = []
for xml_file in self.xml_files:
try:
lxml.etree.parse(str(xml_file))
except lxml.etree.XMLSyntaxError as e:
errors.append(
f" {xml_file.relative_to(self.unpacked_dir)}: Line {e.lineno}: {e.msg}"
)
except Exception as e:
errors.append(
f" {xml_file.relative_to(self.unpacked_dir)}: Unexpected error: {str(e)}"
)
if errors:
print(f"FAILED - Found {len(errors)} XML violations:")
for error in errors:
print(error)
return False
else:
if self.verbose:
print("PASSED - All XML files are well-formed")
return True
def validate_namespaces(self):
errors = []
for xml_file in self.xml_files:
try:
root = lxml.etree.parse(str(xml_file)).getroot()
declared = set(root.nsmap.keys()) - {None}
for attr_val in [v for k, v in root.attrib.items() if k.endswith("Ignorable")]:
undeclared = set(attr_val.split()) - declared
errors.extend(
f" {xml_file.relative_to(self.unpacked_dir)}: "
f"Namespace '{ns}' in Ignorable but not declared"
for ns in undeclared
)
except lxml.etree.XMLSyntaxError:
continue
if errors:
print(f"FAILED - {len(errors)} namespace issues:")
for error in errors:
print(error)
return False
if self.verbose:
print("PASSED - All namespace prefixes properly declared")
return True
def validate_unique_ids(self):
errors = []
global_ids = {}
for xml_file in self.xml_files:
try:
root = lxml.etree.parse(str(xml_file)).getroot()
file_ids = {}
mc_elements = root.xpath(
".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
)
for elem in mc_elements:
elem.getparent().remove(elem)
for elem in root.iter():
tag = elem.tag.split("}")[-1].lower() if "}" in elem.tag else elem.tag.lower()
if tag in self.UNIQUE_ID_REQUIREMENTS:
in_excluded_container = any(
ancestor.tag.split("}")[-1].lower() in self.EXCLUDED_ID_CONTAINERS
for ancestor in elem.iterancestors()
)
if in_excluded_container:
continue
attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]
id_value = None
for attr, value in elem.attrib.items():
attr_local = (
attr.split("}")[-1].lower() if "}" in attr else attr.lower()
)
if attr_local == attr_name:
id_value = value
break
if id_value is not None:
if scope == "global":
if id_value in global_ids:
prev_file, prev_line, prev_tag = global_ids[id_value]
errors.app
…