grok-build/latest/content · 0.2.69
skills/pptx/scripts/office/validators/base.py
Skill30.9 KB148 lines
"""
Base validator with common validation logic for document files.
"""
import re
from pathlib import Path
import defusedxml.minidom
import lxml.etree
class BaseSchemaValidator:
IGNORED_VALIDATION_ERRORS = [
"hyphenationZone",
"purl.org/dc/terms",
]
UNIQUE_ID_REQUIREMENTS = {
"comment": ("id", "file"),
"commentrangestart": ("id", "file"),
"commentrangeend": ("id", "file"),
"bookmarkstart": ("id", "file"),
"bookmarkend": ("id", "file"),
"sldid": ("id", "file"),
"sldmasterid": ("id", "global"),
"sldlayoutid": ("id", "global"),
"cm": ("authorid", "file"),
"sheet": ("sheetid", "file"),
"definedname": ("id", "file"),
"cxnsp": ("id", "file"),
"sp": ("id", "file"),
"pic": ("id", "file"),
"grpsp": ("id", "file"),
}
EXCLUDED_ID_CONTAINERS = {
"sectionlst",
}
ELEMENT_RELATIONSHIP_TYPES = {}
SCHEMA_MAPPINGS = {
"word": "ISO-IEC29500-4_2016/wml.xsd",
"ppt": "ISO-IEC29500-4_2016/pml.xsd",
"xl": "ISO-IEC29500-4_2016/sml.xsd",
"[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
"app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
"core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
"custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
".rels": "ecma/fouth-edition/opc-relationships.xsd",
"people.xml": "microsoft/wml-2012.xsd",
"commentsIds.xml": "microsoft/wml-cid-2016.xsd",
"commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
"commentsExtended.xml": "microsoft/wml-2012.xsd",
"chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
"theme": "ISO-IEC29500-4_2016/dml-main.xsd",
"drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
}
MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
PACKAGE_RELATIONSHIPS_NAMESPACE = "http://schemas.openxmlformats.org/package/2006/relationships"
OFFICE_RELATIONSHIPS_NAMESPACE = (
"http://schemas.openxmlformats.org/officeDocument/2006/relationships"
)
CONTENT_TYPES_NAMESPACE = "http://schemas.openxmlformats.org/package/2006/content-types"
MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}
OOXML_NAMESPACES = {
"http://schemas.openxmlformats.org/officeDocument/2006/math",
"http://schemas.openxmlformats.org/officeDocument/2006/relationships",
"http://schemas.openxmlformats.org/schemaLibrary/2006/main",
"http://schemas.openxmlformats.org/drawingml/2006/main",
"http://schemas.openxmlformats.org/drawingml/2006/chart",
"http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
"http://schemas.openxmlformats.org/drawingml/2006/diagram",
"http://schemas.openxmlformats.org/drawingml/2006/picture",
"http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
"http://schemas.openxmlformats.org/wordprocessingml/2006/main",
"http://schemas.openxmlformats.org/presentationml/2006/main",
"http://schemas.openxmlformats.org/spreadsheetml/2006/main",
"http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
"http://www.w3.org/XML/1998/namespace",
}
def __init__(self, unpacked_dir, original_file=None, verbose=False):
self.unpacked_dir = Path(unpacked_dir).resolve()
self.original_file = Path(original_file) if original_file else None
self.verbose = verbose
self.schemas_dir = Path(__file__).parent.parent / "schemas"
patterns = ["*.xml", "*.rels"]
self.xml_files = [f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)]
if not self.xml_files:
print(f"Warning: No XML files found in {self.unpacked_dir}")
def validate(self):
raise NotImplementedError("Subclasses must implement the validate method")
def repair(self) -> int:
return self.repair_whitespace_preservation()
def repair_whitespace_preservation(self) -> int:
repairs = 0
for xml_file in self.xml_files:
try:
content = xml_file.read_text(encoding="utf-8")
dom = defusedxml.minidom.parseString(content)
modified = False
for elem in dom.getElementsByTagName("*"):
if elem.tagName.endswith(":t") and elem.firstChild:
text = elem.firstChild.nodeValue
if text and (text.startswith((" ", "\t")) or text.endswith((" ", "\t"))):
if elem.getAttribute("xml:space") != "preserve":
elem.setAttribute("xml:space", "preserve")
text_preview = (
repr(text[:30]) + "..." if len(text) > 30 else repr(text)
)
print(
f" Repaired: {xml_file.name}: Added xml:space='preserve' to {elem.tagName}: {text_preview}"
)
repairs += 1
modified = True
if modified:
xml_file.write_bytes(dom.toxml(encoding="UTF-8"))
except Exception:
pass
return repairs
def validate_xml(self):
errors = []
for xml_file in self.xml_files:
try:
lxml.etree.parse(str(xml_file))
except lxml.etree.XMLSyntaxError as e:
err
…