"""Remove unreferenced files from an unpacked PPTX directory. Usage: python clean.py Example: python clean.py unpacked/ This script removes: - Orphaned slides (not in sldIdLst) and their relationships - [trash] directory (unreferenced files) - Orphaned .rels files for deleted resources - Unreferenced media, embeddings, charts, diagrams, drawings, ink files - Unreferenced theme files - Unreferenced notes slides - Content-Type overrides for deleted files """ import sys from pathlib import Path import defusedxml.minidom import re def get_slides_in_sldidlst(unpacked_dir: Path) -> set[str]: pres_path = unpacked_dir / "ppt" / "presentation.xml" pres_rels_path = unpacked_dir / "ppt" / "_rels" / "presentation.xml.rels" if not pres_path.exists() or not pres_rels_path.exists(): return set() rels_dom = defusedxml.minidom.parse(str(pres_rels_path)) rid_to_slide = {} for rel in rels_dom.getElementsByTagName("Relationship"): rid = rel.getAttribute("Id") target = rel.getAttribute("Target") rel_type = rel.getAttribute("Type") if "slide" in rel_type and target.startswith("slides/"): rid_to_slide[rid] = target.replace("slides/", "") pres_content = pres_path.read_text(encoding="utf-8") referenced_rids = set(re.findall(r']*r:id="([^"]+)"', pres_content)) return {rid_to_slide[rid] for rid in referenced_rids if rid in rid_to_slide} def remove_orphaned_slides(unpacked_dir: Path) -> list[str]: slides_dir = unpacked_dir / "ppt" / "slides" slides_rels_dir = slides_dir / "_rels" pres_rels_path = unpacked_dir / "ppt" / "_rels" / "presentation.xml.rels" if not slides_dir.exists(): return [] referenced_slides = get_slides_in_sldidlst(unpacked_dir) removed = [] for slide_file in slides_dir.glob("slide*.xml"): if slide_file.name not in referenced_slides: rel_path = slide_file.relative_to(unpacked_dir) slide_file.unlink() removed.append(str(rel_path)) rels_file = slides_rels_dir / f"{slide_file.name}.rels" if rels_file.exists(): rels_file.unlink() removed.append(str(rels_file.relative_to(unpacked_dir))) if removed and pres_rels_path.exists(): rels_dom = defusedxml.minidom.parse(str(pres_rels_path)) changed = False for rel in list(rels_dom.getElementsByTagName("Relationship")): target = rel.getAttribute("Target") if target.startswith("slides/"): slide_name = target.replace("slides/", "") if slide_name not in referenced_slides: if rel.parentNode: rel.parentNode.removeChild(rel) changed = True if changed: with open(pres_rels_path, "wb") as f: f.write(rels_dom.toxml(encoding="utf-8")) return removed def remove_trash_directory(unpacked_dir: Path) -> list[str]: trash_dir = unpacked_dir / "[trash]" removed = [] if trash_dir.exists() and trash_dir.is_dir(): for file_path in trash_dir.iterdir(): if file_path.is_file(): rel_path = file_path.relative_to(unpacked_dir) removed.append(str(rel_path)) file_path.unlink() trash_dir.rmdir() return removed def get_slide_referenced_files(unpacked_dir: Path) -> set: referenced = set() slides_rels_dir = unpacked_dir / "ppt" / "slides" / "_rels" if not slides_rels_dir.exists(): return referenced for rels_file in slides_rels_dir.glob("*.rels"): dom = defusedxml.minidom.parse(str(rels_file)) for rel in dom.getElementsByTagName("Relationship"): target = rel.getAttribute("Target") if not target: continue target_path = (rels_file.parent.parent / target).resolve() try: referenced.add(target_path.relative_to(unpacked_dir.resolve())) except ValueError: pass return referenced def remove_orphaned_rels_files(unpacked_dir: Path) -> list[str]: resource_dirs = ["charts", "diagrams", "drawings"] removed = [] slide_referenced = get_slide_referenced_files(unpacked_dir) for dir_name in resource_dirs: rels_dir = unpacked_dir / "ppt" / dir_name / "_rels" if not rels_dir.exists(): continue for rels_file in rels_dir.glob("*.rels"): resource_file = rels_dir.parent / rels_file.name.replace(".rels", "") try: resource_rel_path = resource_file.resolve().relative_to(unpacked_dir.resolve()) except ValueError: continue if not resource_file.exists() or resource_rel_path not in slide_referenced: rels_file.unlink() rel_path = rels_file.relative_to(unpacked_dir) removed.append(str(rel_path)) return removed def get_referenced_files(unpacked_dir: Path) -> set: referenced = set() for rels_file in unpacked_dir.rglob("*.rels"): dom = defusedxml.minidom.parse(str(rels_file)) for rel in dom.getElementsByTagName("Relationship"): target = rel.getAttribute("Target") if not target: continue target_path = (rels_file.parent.parent / target).resolve() try: referenced.add(target_path.relative_to(unpacked_dir.resolve())) except ValueError: pass return referenced def remove_orphaned_files(unpacked_dir: Path, referenced: set) -> list[str]: resource_dirs = ["media", "embeddings", "charts", "diagrams", "tags", "drawings", "ink"] removed = [] for dir_name in resource_dirs: dir_path = unpacked_dir / "ppt" / dir_name if not dir_path.exists(): continue for file_path in dir_path.glob("*"): if not file_path.is_file(): continue rel_path = file_path.relative_to(unpacked_dir) if rel_path not in referenced: file_path.unlink() removed.append(str(rel_path)) theme_dir = unpacked_dir / "ppt" / "theme" if theme_dir.exists(): for file_path in theme_dir.glob("theme*.xml"): rel_path = file_path.relative_to(unpacked_dir) if rel_path not in referenced: file_path.unlink() removed.append(str(rel_path)) theme_rels = theme_dir / "_rels" / f"{file_path.name}.rels" if theme_rels.exists(): theme_rels.unlink() removed.append(str(theme_rels.relative_to(unpacked_dir))) notes_dir = unpacked_dir / "ppt" / "notesSlides" if notes_dir.exists(): for file_path in notes_dir.glob("*.xml"): if not file_path.is_file(): continue rel_path = file_path.relative_to(unpacked_dir) if rel_path not in referenced: file_path.unlink() removed.append(str(rel_path)) notes_rels_dir = notes_dir / "_rels" if notes_rels_dir.exists(): for file_path in notes_rels_dir.glob("*.rels"): notes_file = notes_dir / file_path.name.replace(".rels", "") if not notes_file.exists(): file_path.unlink() removed.append(str(file_path.relative_to(unpacked_dir))) return removed def update_content_types(unpacked_dir: Path, removed_files: list[str]) -> None: ct_path = unpacked_dir / "[Content_Types].xml" if not ct_path.exists(): return dom = defusedxml.minidom.parse(str(ct_path)) changed = False for override in list(dom.getElementsByTagName("Override")): part_name = override.getAttribute("PartName").lstrip("/") if part_name in removed_files: if override.parentNode: override.parentNode.removeChild(override) changed = True if changed: with open(ct_path, "wb") as f: f.write(dom.toxml(encoding="utf-8")) def clean_unused_files(unpacked_dir: Path) -> list[str]: all_removed = [] slides_removed = remove_orphaned_slides(unpacked_dir) all_removed.extend(slides_removed) trash_removed = remove_trash_directory(unpacked_dir) all_removed.extend(trash_removed) while True: removed_rels = remove_orphaned_rels_files(unpacked_dir) referenced = get_referenced_files(unpacked_dir) removed_files = remove_orphaned_files(unpacked_dir, referenced) total_removed = removed_rels + removed_files if not total_removed: break all_removed.extend(total_removed) if all_removed: update_content_types(unpacked_dir, all_removed) return all_removed if __name__ == "__main__": if len(sys.argv) != 2: print("Usage: python clean.py ", file=sys.stderr) print("Example: python clean.py unpacked/", file=sys.stderr) sys.exit(1) unpacked_dir = Path(sys.argv[1]) if not unpacked_dir.exists(): print(f"Error: {unpacked_dir} not found", file=sys.stderr) sys.exit(1) removed = clean_unused_files(unpacked_dir) if removed: print(f"Removed {len(removed)} unreferenced files:") for f in removed: print(f" {f}") else: print("No unreferenced files found")