grok-build/latest/content · 0.2.69
skills/pptx/scripts/office/validators/redlining.py
Skill8.6 KB158 lines
"""
Validator for tracked changes in Word documents.
"""
import subprocess
import tempfile
import zipfile
from pathlib import Path
class RedliningValidator:
def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"):
self.unpacked_dir = Path(unpacked_dir)
self.original_docx = Path(original_docx)
self.verbose = verbose
self.author = author
self.namespaces = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
def repair(self) -> int:
return 0
def validate(self):
modified_file = self.unpacked_dir / "word" / "document.xml"
if not modified_file.exists():
print(f"FAILED - Modified document.xml not found at {modified_file}")
return False
try:
import xml.etree.ElementTree as ET
tree = ET.parse(modified_file)
root = tree.getroot()
del_elements = root.findall(".//w:del", self.namespaces)
ins_elements = root.findall(".//w:ins", self.namespaces)
author_del_elements = [
elem
for elem in del_elements
if elem.get(f"{{{self.namespaces['w']}}}author") == self.author
]
author_ins_elements = [
elem
for elem in ins_elements
if elem.get(f"{{{self.namespaces['w']}}}author") == self.author
]
if not author_del_elements and not author_ins_elements:
if self.verbose:
print(f"PASSED - No tracked changes by {self.author} found.")
return True
except Exception:
pass
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
try:
with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
zip_ref.extractall(temp_path)
except Exception as e:
print(f"FAILED - Error unpacking original docx: {e}")
return False
original_file = temp_path / "word" / "document.xml"
if not original_file.exists():
print(f"FAILED - Original document.xml not found in {self.original_docx}")
return False
try:
import xml.etree.ElementTree as ET
modified_tree = ET.parse(modified_file)
modified_root = modified_tree.getroot()
original_tree = ET.parse(original_file)
original_root = original_tree.getroot()
except ET.ParseError as e:
print(f"FAILED - Error parsing XML files: {e}")
return False
self._remove_author_tracked_changes(original_root)
self._remove_author_tracked_changes(modified_root)
modified_text = self._extract_text_content(modified_root)
original_text = self._extract_text_content(original_root)
if modified_text != original_text:
error_message = self._generate_detailed_diff(original_text, modified_text)
print(error_message)
return False
if self.verbose:
print(f"PASSED - All changes by {self.author} are properly tracked")
return True
def _generate_detailed_diff(self, original_text, modified_text):
error_parts = [
f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes",
"",
"Likely causes:",
" 1. Modified text inside another author's <w:ins> or <w:del> tags",
" 2. Made edits without proper tracked changes",
" 3. Didn't nest <w:del> inside <w:ins> when deleting another's insertion",
"",
"For pre-redlined documents, use correct patterns:",
" - To reject another's INSERTION: Nest <w:del> inside their <w:ins>",
" - To restore another's DELETION: Add new <w:ins> AFTER their <w:del>",
"",
]
git_diff = self._get_git_word_diff(original_text, modified_text)
if git_diff:
error_parts.extend(["Differences:", "============", git_diff])
else:
error_parts.append("Unable to generate word diff (git not available)")
return "\n".join(error_parts)
def _get_git_word_diff(self, original_text, modified_text):
try:
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
original_file = temp_path / "original.txt"
modified_file = temp_path / "modified.txt"
original_file.write_text(original_text, encoding="utf-8")
modified_file.write_text(modified_text, encoding="utf-8")
result = subprocess.run(
[
"git",
"diff",
"--word-diff=plain",
"--word-diff-regex=.",
"-U0",
"--no-index",
str(original_file),
str(modified_file),
],
capture_output=True,
text=True,
)
if result.stdout.strip():
lines = result.stdout.split("\n")
content_lines = []
in_content = False
for line in lines:
if line.startswith("@@"):
in_content = True
continue
if in_content and line.strip():
content_lines.append(line)
if content_li
…