VT-PR/scripts/subby/subby/converters/webvtt.py
2025-04-14 08:39:57 +05:30

163 lines
5.5 KiB
Python

from __future__ import annotations
import html
import re
from functools import partial
from typing import Optional
import tinycss
from srt import Subtitle
from subby.converters.base import BaseConverter
from subby.subripfile import SubRipFile
from subby.utils.time import timedelta_from_timestamp
# Any opening/closing tag without whitespace in it, except those whose name
# starts with "i" (so <i>/</i> survive). Used to strip leftover markup.
HTML_TAG = re.compile(r'</?(?!/?i)[^>\s]+>')
# WebVTT class spans: <c.classname>text</c>.
# Group 1 is the class name, group 2 the enclosed text. The dot separating "c"
# from the class name is escaped so that it only matches a literal "."
# (unescaped it matched any character, e.g. "<cxfoo>" or "<c>" + text).
STYLE_TAG_OPEN = re.compile(r'^<c\.([a-zA-Z0-9]+)>([^<]+)')
STYLE_TAG = re.compile(r'<c\.([a-zA-Z0-9]+)>([^<]+)<\/c>')
STYLE_TAG_CLOSE = re.compile(r'<\/c>$')
# Line prefixes that mark non-cue content (header, comments, CSS comments, HLS timestamp map)
SKIP_WORDS = ('WEBVTT', 'NOTE', '/*', 'X-TIMESTAMP-MAP')
# Matches opening <v Name> voice tags only (HTML_TAG cannot, as they contain whitespace).
# The closing </v> tag contains no whitespace and is matched by HTML_TAG instead.
SPEAKER_TAG = re.compile(r'<v\s+[^>]+>')
class WebVTTConverter(BaseConverter):
    """WebVTT subtitle converter"""

    def parse(self, stream):
        """
        Parses a WebVTT document into a SubRipFile

        `stream` is iterated line by line and every line is decoded as UTF-8,
        so it must yield bytes. Cues are collected as `srt.Subtitle` entries;
        class spans whose STYLE rule declares `font-style: italic` become
        <i>...</i>, every other non-italic tag is stripped, and cues with a
        `line` setting below 25 get an {\\an8} (top) tag on their first line.
        """
        srt = SubRipFile()
        looking_for_text = False
        looking_for_style = False
        text = []
        position = None
        line_number = 1
        styles = {}
        current_style = []
        css_parser = tinycss.make_parser('page3')

        for line in stream:
            # As our stream is bytes we have to deal with line breaks here
            line = line.decode('utf-8').replace('\r\n', '\n').replace('\r', '\n').strip()

            # Skip processing any unnecessary lines
            # NOTE(review): this also drops cue text lines starting with one of
            # these prefixes; kept as-is because headers may reappear mid-stream
            if line.startswith(SKIP_WORDS):
                continue

            # Empty line separates cues
            if line == '':
                # Parse current style
                if looking_for_style:
                    stylesheet = css_parser.parse_stylesheet('\n'.join(current_style))
                    for rule in stylesheet.rules:
                        # At-rules carry no token-list selector, only rule sets do
                        if getattr(rule, 'at_keyword', None) is not None:
                            continue
                        # Looking for ::cue(.classname) selectors
                        ft = next((e for e in rule.selector if e.type == 'FUNCTION'), None)
                        if not ft:
                            continue
                        name = next((t for t in ft.content if t.type == 'IDENT'), None)
                        if not name:
                            continue
                        styles[name.value] = {
                            dec.name: dec.value.as_css() for dec in rule.declarations
                        }
                    looking_for_style = False
                    # Start fresh so a later STYLE block doesn't re-parse this one
                    current_style = []

                # Keep looking for text if last line has none
                # this will only happen if there's an unexpected line break
                if not text:
                    continue

                srt[-1].content = '\n'.join(text)
                text = []
                looking_for_text = False
            # Check for style start; the block header is a line of its own,
            # a substring check would also trigger on cue text such as "LIFESTYLE"
            elif line == 'STYLE':
                looking_for_style = True
            # Check for style content
            elif looking_for_style:
                current_style.append(line)
            # Check for time line
            elif ' --> ' in line:
                # No blank line after the previous cue: store its text now,
                # otherwise it would end up in the cue we're about to create
                if text:
                    srt[-1].content = '\n'.join(text)
                    text = []

                start, _, end, *settings = line.split()
                position = self._get_position([s for s in settings if ':' in s])

                # Fix short timecodes (no hour)
                if start.count(':') == 1:
                    start = f'00:{start}'
                if end.count(':') == 1:
                    end = f'00:{end}'

                srt.append(Subtitle(
                    index=line_number,
                    start=timedelta_from_timestamp(start),
                    end=timedelta_from_timestamp(end),
                    content=''
                ))
                looking_for_text = True
                line_number += 1
            # Append text if we're inside a line
            elif looking_for_text:
                # Unescape html entities
                line = html.unescape(line)
                # Remove speaker tags here
                line = SPEAKER_TAG.sub('', line)
                # Set \an8 tag if position is below 25
                # (value taken from SubtitleEdit)
                if position is not None and position < 25:
                    line = '{\\an8}' + line
                    # Only the first line of a cue gets the tag
                    position = None
                text.append(line.strip())

        # Add any leftover text to the last line
        if text:
            srt[-1].content = '\n'.join(text)

        replace_italics = partial(self._replace_italics, styles=styles)
        for cue in srt:
            # Replace styles with italics tag when appropriate
            # (replace instead of match, to handle nested)
            cue.content = STYLE_TAG.sub(replace_italics, cue.content)
            # Strip non-italic tags
            cue.content = HTML_TAG.sub('', cue.content)

        return srt

    @staticmethod
    def _get_position(cue_settings: list[str]) -> Optional[float]:
        """
        Parses list of cue settings and extracts position offset as a float
        Line number based offset and alignment strings are ignored
        (except for line number 0, which is returned as 0.0)
        Malformed settings are skipped instead of raising
        https://www.w3.org/TR/webvtt1/#webvtt-line-cue-setting
        """
        if not cue_settings or cue_settings == ['None']:
            return None

        for setting in cue_settings:
            # partition() tolerates settings with zero or several colons
            key, _, val = setting.partition(':')
            if key != 'line' or not val:
                continue

            # Drop the optional alignment part ("10%,start" -> "10%")
            offset = val.split(',')[0]
            if offset.endswith('%'):
                try:
                    return float(offset[:-1])
                except ValueError:
                    # Not a number, ignore this setting
                    continue
            if offset == '0':
                return 0.0

        return None

    @staticmethod
    def _replace_italics(match: re.Match, styles: dict[str, dict[str, str]]) -> str:
        """
        Substitution callback for STYLE_TAG matches
        Returns the span's text wrapped in <i></i> if the style named by the
        span's class sets font-style to italic, the unchanged match otherwise
        """
        style = styles.get(match[1])
        if style and style.get('font-style') == 'italic':
            return f'<i>{match[2]}</i>'
        return match[0]