64 lines
		
	
	
		
			1.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			64 lines
		
	
	
		
			1.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Convert a TTML/XML subtitle file (stored as .srt in this repository) to plain text.

Every <p> element becomes one paragraph; the <s> elements inside it are
joined on a single line.  The result is saved in the same directory as the
input, with a .txt extension.

Usage:
    script.py [INPUT_PATH]

When INPUT_PATH is omitted, DEFAULT_INPUT is used.
"""
import re
import sys
from pathlib import Path
from typing import List, Optional
from xml.etree import ElementTree as ET

# Input used when no path is given on the command line.
DEFAULT_INPUT = Path('/media/kyman/SSD2TB/git.copilot/OpoTests/Public/data/srt/Real Decreto Legislativo 5⁄2015, TREBEP - Art 1 a 7 - 1a parte. (1080p_30fps_H264-128kbit_AAC).español.srt')


def local_name(tag) -> str:
    """Return *tag* without its ``{namespace}`` prefix ('' for non-string tags)."""
    if not isinstance(tag, str):
        # Comments / processing instructions use a callable as their tag.
        return ''
    return tag.rsplit('}', 1)[-1]


def descendants(element: ET.Element, name: str) -> List[ET.Element]:
    """Return the descendants of *element* whose local tag name is *name*.

    Elements are returned in document order and *element* itself is never
    included.  Matching on the local name makes the lookup work both for
    documents without a namespace and for documents that declare one.
    """
    return [
        node
        for node in element.iter()
        if node is not element and local_name(node.tag) == name
    ]


def parse_body_fragment(text: str) -> Optional[ET.Element]:
    """Parse only the ``<body>...</body>`` part of *text*.

    Used as a recovery step when the document as a whole is not well-formed.

    Returns:
        The parsed ``body`` element, or ``None`` when *text* contains no
        opening ``<body`` tag.

    Raises:
        xml.etree.ElementTree.ParseError: if the fragment itself is not
            well-formed XML.
    """
    opening = re.search(r'<body[\s>]', text)
    if opening is None:
        return None
    start = opening.start()
    closing = text.rfind('</body>')
    if closing > start:
        # Stop right after </body>: whatever follows (e.g. the closing tag of
        # the outer element) would make the fragment unbalanced.
        fragment = text[start:closing + len('</body>')]
    else:
        fragment = text[start:]
    return ET.fromstring(fragment)


def load_root(path: Path) -> ET.Element:
    """Parse *path* and return the root element.

    If the file is not well-formed, fall back to parsing just its
    ``<body>`` section; the original parse error is re-raised when the file
    has no ``<body>`` tag at all.
    """
    try:
        return ET.parse(path).getroot()
    except ET.ParseError:
        root = parse_body_fragment(path.read_text(encoding='utf-8'))
        if root is None:
            raise
        return root


def extract_paragraphs(root: ET.Element) -> List[str]:
    """Return one whitespace-normalized string per non-empty <p> under *root*.

    The stripped text of each <s> descendant is joined with single spaces.
    A <p> without usable <s> text falls back to all of its text content.
    """
    paragraphs = []
    for p in descendants(root, 'p'):
        parts = []
        for s in descendants(p, 's'):
            chunk = (s.text or '').strip()
            if chunk:
                parts.append(chunk)
        if not parts:
            # Fallback: use any text content found inside the paragraph.
            text = ''.join(p.itertext()).strip()
            if text:
                parts = [text]
        # Collapse every run of whitespace into a single space.
        para = re.sub(r"\s+", ' ', ' '.join(parts)).strip()
        if para:
            paragraphs.append(para)
    return paragraphs


def main(argv: Optional[List[str]] = None) -> int:
    """Convert the input file and print a short report.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.  The first argument, when present, is the
            input path.

    Returns:
        Process exit status: 0 on success, 2 when the input file is missing.
    """
    args = sys.argv[1:] if argv is None else argv
    in_path = Path(args[0]) if args else DEFAULT_INPUT
    if not in_path.exists():
        print('ERROR: input file not found:', in_path)
        return 2

    out_path = in_path.with_suffix('.txt')
    paragraphs = extract_paragraphs(load_root(in_path))

    full_text = '\n\n'.join(paragraphs)
    # Drop markup that was stored escaped inside the subtitle text itself.
    full_text = re.sub(r'<[^>]+>', '', full_text)

    out_path.write_text(full_text, encoding='utf-8')
    print(f'WROTE: {out_path}')
    print(f'PARAGRAPHS: {len(paragraphs)}')
    # Print a short preview to confirm the result.
    preview = full_text[:4000]
    print('\n---PREVIEW---\n')
    print(preview)
    print('\n---END PREVIEW---\n')
    print('Done')
    return 0


if __name__ == '__main__':
    sys.exit(main())