#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Convert a TTML/XML subtitle file to plain text.

The input (stored with an ``.srt`` extension in this repository) is parsed
as XML. Each ``<p>`` element becomes one paragraph and the ``<s>`` segments
inside it are joined into a single line. The result is written next to the
input file with a ``.txt`` extension.

Usage::

    python3 this_script.py [INPUT_FILE]

When INPUT_FILE is omitted, ``DEFAULT_INPUT`` is used.
"""
import re
import sys
from pathlib import Path
from xml.etree import ElementTree as ET

# Input used when no path is given on the command line.
DEFAULT_INPUT = Path('/media/kyman/SSD2TB/git.copilot/OpoTests/Public/data/srt/Real Decreto Legislativo 5⁄2015, TREBEP - Art 1 a 7 - 1a parte. (1080p_30fps_H264-128kbit_AAC).español.srt')

# Number of characters of the result echoed to stdout as a sanity check.
PREVIEW_CHARS = 4000

_WHITESPACE_RE = re.compile(r"\s+")
_TAG_RE = re.compile(r'<[^>]+>')


def _local_name(tag):
    """Return *tag* without its ``{namespace}`` prefix ('' for non-string tags)."""
    if not isinstance(tag, str):
        return ''
    return tag.rsplit('}', 1)[-1]


def _descendants(element, name):
    """Yield descendants of *element* whose local tag name is *name*.

    Nodes are yielded in document order; *element* itself is never yielded.
    Matching on the local name makes the lookup work both for documents
    without a namespace and for namespaced TTML.
    """
    for node in element.iter():
        if node is not element and _local_name(node.tag) == name:
            yield node


def parse_xml_text(text):
    """Parse already-decoded *text* as XML, skipping junk before the first ``<``.

    Returns the root element, or ``None`` when *text* contains no ``<`` at
    all. Raises ``xml.etree.ElementTree.ParseError`` if the remaining text is
    still not well-formed.
    """
    start = text.find('<')
    if start == -1:
        return None
    return ET.fromstring(text[start:])


def parse_root(path):
    """Return the root element of the XML document stored at *path*.

    The file is first parsed directly. If that fails, it is decoded as UTF-8
    and parsed again starting at the first ``<``, which tolerates stray
    content ahead of the document. When the text holds no markup at all, the
    original parsing error is re-raised.
    """
    try:
        return ET.parse(path).getroot()
    except (ET.ParseError, LookupError, ValueError):
        # ParseError: malformed markup. LookupError/ValueError are included
        # on the assumption that the parser reports an unusable declared
        # encoding that way.
        root = parse_xml_text(Path(path).read_text(encoding='utf-8'))
        if root is None:
            raise
        return root


def extract_paragraphs(root):
    """Return one whitespace-normalised string per non-empty ``<p>`` under *root*.

    For each ``<p>``, the stripped ``text`` of every ``<s>`` descendant is
    joined with single spaces. If that yields nothing, the full text content
    of the ``<p>`` is used instead. Paragraphs that end up empty are dropped.
    """
    paragraphs = []
    for p in _descendants(root, 'p'):
        stripped = ((s.text or '').strip() for s in _descendants(p, 's'))
        parts = [segment for segment in stripped if segment]
        if not parts:
            # Fallback to any text content.
            text = ''.join(p.itertext()).strip()
            if text:
                parts = [text]
        # Normalize whitespace.
        para = _WHITESPACE_RE.sub(' ', ' '.join(parts)).strip()
        if para:
            paragraphs.append(para)
    return paragraphs


def paragraphs_to_text(paragraphs):
    """Join *paragraphs* with blank lines and remove anything shaped like a tag."""
    return _TAG_RE.sub('', '\n\n'.join(paragraphs))


def main(argv=None):
    """Convert the input file and return the process exit status.

    *argv* is the list of command-line arguments without the program name;
    it defaults to ``sys.argv[1:]``. Returns 2 when the input file does not
    exist and 0 on success.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    in_path = Path(args[0]) if args else DEFAULT_INPUT
    if not in_path.exists():
        print('ERROR: input file not found:', in_path)
        return 2

    out_path = in_path.with_suffix('.txt')
    if out_path == in_path:
        # Never overwrite the input when it already ends in .txt.
        out_path = in_path.with_name(in_path.name + '.txt')

    paragraphs = extract_paragraphs(parse_root(in_path))
    full_text = paragraphs_to_text(paragraphs)
    out_path.write_text(full_text, encoding='utf-8')

    print(f'WROTE: {out_path}')
    print(f'PARAGRAPHS: {len(paragraphs)}')
    # Print a short preview to confirm.
    print('\n---PREVIEW---\n')
    print(full_text[:PREVIEW_CHARS])
    print('\n---END PREVIEW---\n')
    print('Done')
    return 0


if __name__ == '__main__':
    sys.exit(main())