# OpoTests/Tools/convert_tt_to_txt.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Convert the TTML/XML file (stored with a .srt extension in this repository)
to plain text, grouping each <p> as a paragraph and joining its <s> elements
into a single line.
The result is saved in the same directory with a .txt extension.
"""
import sys
from pathlib import Path
import re
from xml.etree import ElementTree as ET

# Input used when no path is given on the command line.
DEFAULT_INPUT = Path('/media/kyman/SSD2TB/git.copilot/OpoTests/Public/data/srt/Real Decreto Legislativo 5⁄2015, TREBEP - Art 1 a 7 - 1a parte. (1080p_30fps_H264-128kbit_AAC).español.srt')

# Compiled once; used for every paragraph.
_TAG_RE = re.compile(r'<[^>]+>')
_WS_RE = re.compile(r"\s+")
# First opening <body ...> tag through the last closing </body> tag.
_BODY_RE = re.compile(r'<body\b[^>]*>.*</body\s*>', re.DOTALL)


def _local_name(tag) -> str:
    """Return an element tag without its ``{namespace}`` prefix.

    Non-string tags (comments, processing instructions) yield ``''``.
    """
    if not isinstance(tag, str):
        return ''
    return tag.rpartition('}')[2]


def _parse_root(in_path: Path) -> ET.Element:
    """Parse *in_path* as XML and return its root element.

    If the document as a whole is not well-formed, fall back to parsing only
    the ``<body>...</body>`` section wrapped in a synthetic ``<root>``.

    Raises:
        xml.etree.ElementTree.ParseError: if the file cannot be parsed and no
            usable ``<body>`` section can be recovered.
    """
    try:
        return ET.parse(in_path).getroot()
    except ET.ParseError:
        # Best-effort recovery: undecodable bytes are replaced rather than
        # aborting, since they may be what broke the strict parse.
        raw = in_path.read_text(encoding='utf-8', errors='replace')
        match = _BODY_RE.search(raw)
        if match is None:
            raise
        # Cut at </body> so closing tags of the real root do not end up
        # dangling inside the synthetic wrapper.
        return ET.fromstring('<root>' + match.group(0) + '</root>')


def _paragraph_text(p: ET.Element) -> str:
    """Return the cleaned single-line text of one ``<p>`` element.

    The stripped texts of descendant ``<s>`` elements are joined with single
    spaces; when there are none, the whole text content of ``<p>`` is used.
    Tag-like fragments are removed and whitespace runs are collapsed.
    """
    parts = []
    for node in p.iter():
        if _local_name(node.tag) != 's':
            continue
        piece = (node.text or '').strip()
        if piece:
            parts.append(piece)
    if not parts:
        # fallback to any text content
        whole = ''.join(p.itertext()).strip()
        if whole:
            parts = [whole]
    # Strip escaped markup before normalizing whitespace so a removed tag
    # cannot leave a double space or an empty paragraph behind.
    text = _TAG_RE.sub('', ' '.join(parts))
    return _WS_RE.sub(' ', text).strip()


def _extract_paragraphs(root: ET.Element) -> list:
    """Return the non-empty paragraph strings of every ``<p>`` under *root*.

    Tags are matched by local name, so documents that declare an XML
    namespace (as TTML does) are handled as well as un-namespaced ones.
    """
    paragraphs = []
    for node in root.iter():
        if _local_name(node.tag) != 'p':
            continue
        para = _paragraph_text(node)
        if para:
            paragraphs.append(para)
    return paragraphs


def main(argv=None) -> int:
    """Convert one timed-text XML file to a ``.txt`` file next to it.

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, if present, is the input
            path; otherwise ``DEFAULT_INPUT`` is used.

    Returns:
        0 on success, 2 when the input file does not exist.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    in_path = Path(args[0]) if args else DEFAULT_INPUT
    if not in_path.exists():
        print('ERROR: input file not found:', in_path)
        return 2

    out_path = in_path.with_suffix('.txt')

    paragraphs = _extract_paragraphs(_parse_root(in_path))
    full_text = '\n\n'.join(paragraphs)

    out_path.write_text(full_text, encoding='utf-8')
    print(f'WROTE: {out_path}')
    print(f'PARAGRAPHS: {len(paragraphs)}')
    # print a short preview to confirm
    preview = full_text[:4000]
    print('\n---PREVIEW---\n')
    print(preview)
    print('\n---END PREVIEW---\n')
    print('Done')
    return 0


if __name__ == '__main__':
    sys.exit(main())