grok-build/latest/content · 0.2.69
skills/docx/scripts/office/validators/docx.py
Skill · 15.4 KB · 141 lines
"""
Validator for Word document XML files against XSD schemas.
"""
import random
import re
import tempfile
import zipfile
import defusedxml.minidom
import lxml.etree
from .base import BaseSchemaValidator
# Schema validator for the XML parts of a Word document, built on BaseSchemaValidator.
class DOCXSchemaValidator(BaseSchemaValidator):
# Main WordprocessingML namespace URI; the checks below use it to qualify
# w:t, w:del and w:instrText element lookups.
WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Word 2010 extension namespace URI. Not referenced in the visible part of this
# class -- presumably used by later ID/paraId checks; TODO confirm.
W14_NAMESPACE = "http://schemas.microsoft.com/office/word/2010/wordml"
# Word 2016 "cid" namespace URI. Not referenced in the visible part of this
# class -- presumably used by comment-related checks; TODO confirm.
W16CID_NAMESPACE = "http://schemas.microsoft.com/office/word/2016/wordml/cid"
# Empty mapping. NOTE(review): looks like an override of a base-class table
# consumed by the relationship checks -- verify against BaseSchemaValidator.
ELEMENT_RELATIONSHIP_TYPES = {}
def validate(self):
    """Run the full validation suite and report whether everything passed.

    ``validate_xml`` acts as a gate: when it fails, no further check is
    attempted and ``False`` is returned immediately. Otherwise every
    remaining check is executed in a fixed order, regardless of earlier
    failures, and ``compare_paragraph_counts`` is called last with its
    result ignored.

    Returns:
        bool: ``True`` only if the gate and every subsequent check
        returned a truthy value.
    """
    if not self.validate_xml():
        return False

    # Order matters: it fixes the sequence in which the checks execute.
    check_names = (
        "validate_namespaces",
        "validate_unique_ids",
        "validate_file_references",
        "validate_content_types",
        "validate_against_xsd",
        "validate_whitespace_preservation",
        "validate_deletions",
        "validate_insertions",
        "validate_all_relationship_ids",
        "validate_id_constraints",
        "validate_comment_markers",
    )

    # Materialise the list so that a failing check never short-circuits
    # the ones after it; each method is looked up just before it runs.
    outcomes = [bool(getattr(self, name)()) for name in check_names]

    self.compare_paragraph_counts()
    return all(outcomes)
def validate_whitespace_preservation(self):
    """Check that w:t text with edge whitespace carries xml:space='preserve'.

    Only entries of ``self.xml_files`` named ``document.xml`` are scanned.
    A ``w:t`` element is reported when its text starts or ends with a
    space, tab, newline or carriage return and its ``xml:space``
    attribute is absent or not equal to ``"preserve"``. Any exception
    raised while parsing or scanning a file is recorded as an error entry
    for that file instead of propagating.

    Returns:
        bool: ``True`` when nothing was recorded (a PASSED line is printed
        only if ``self.verbose``); ``False`` after printing a FAILED
        header followed by every recorded entry.
    """
    edge_whitespace = " \t\n\r"
    text_tag = f"{{{self.WORD_2006_NAMESPACE}}}t"
    problems = []

    for xml_file in self.xml_files:
        if xml_file.name != "document.xml":
            continue
        try:
            tree_root = lxml.etree.parse(str(xml_file)).getroot()
            for node in tree_root.iter(text_tag):
                content = node.text
                if not content:
                    continue
                # Only leading/trailing whitespace is of interest here.
                if content[0] not in edge_whitespace and content[-1] not in edge_whitespace:
                    continue
                if node.get(f"{{{self.XML_NAMESPACE}}}space") == "preserve":
                    continue
                # Keep the report short: show at most 50 characters of the repr.
                shown = repr(content)
                if len(shown) > 50:
                    shown = shown[:50] + "..."
                problems.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: "
                    f"Line {node.sourceline}: w:t element with whitespace missing xml:space='preserve': {shown}"
                )
        except Exception as exc:  # XML syntax errors are covered by Exception too
            problems.append(f" {xml_file.relative_to(self.unpacked_dir)}: Error: {exc}")

    if not problems:
        if self.verbose:
            print("PASSED - All whitespace is properly preserved")
        return True

    print(f"FAILED - Found {len(problems)} whitespace preservation violations:")
    for entry in problems:
        print(entry)
    return False
def validate_deletions(self):
errors = []
for xml_file in self.xml_files:
if xml_file.name != "document.xml":
continue
try:
root = lxml.etree.parse(str(xml_file)).getroot()
namespaces = {"w": self.WORD_2006_NAMESPACE}
for t_elem in root.xpath(".//w:del//w:t", namespaces=namespaces):
if t_elem.text:
text_preview = (
repr(t_elem.text)[:50] + "..."
if len(repr(t_elem.text)) > 50
else repr(t_elem.text)
)
errors.append(
f" {xml_file.relative_to(self.unpacked_dir)}: "
f"Line {t_elem.sourceline}: <w:t> found within <w:del>: {text_preview}"
)
for instr_elem in root.xpath(".//w:del//w:instrText", namespaces=namespaces):
text_preview = (
repr(instr_elem.text or "")[:50] + "..."
if len(repr(instr_elem.text or "")) > 50
else repr(instr_elem.text or "")
)
errors.append(
f" {xml_file.relative_to(self.unpacked_dir)}: "
f"Line {instr_elem.sourceline}: <w:instrText> found within <w:del> (use <w:delInstrText>): {text_preview}"
)
except (lxml.etree.XMLSyntaxError, Exception) as e:
errors.append(f" {xml_file.relative_to(s
…