# Forked from ayuspie/VT-PR
from __future__ import annotations
|
|
|
|
import html
|
|
import re
|
|
from functools import partial
|
|
from typing import Optional
|
|
|
|
import tinycss
|
|
from srt import Subtitle
|
|
|
|
from subby.converters.base import BaseConverter
|
|
from subby.subripfile import SubRipFile
|
|
from subby.utils.time import timedelta_from_timestamp
|
|
|
|
# Matches any opening or closing tag that contains no whitespace and whose
# name does not start with "i"; used to strip everything except <i>/</i>.
HTML_TAG = re.compile(r'</?(?!/?i)[^>\s]+>')

# WebVTT class spans: <c.classname>text</c>.
# Group 1 is the class name, group 2 the enclosed text (where captured).
# The dot is escaped so that only a literal "." separates "c" from the class
# name (an unescaped dot would also accept e.g. "<cite>").
STYLE_TAG_OPEN = re.compile(r'^<c\.([a-zA-Z0-9]+)>([^<]+)')
STYLE_TAG = re.compile(r'<c\.([a-zA-Z0-9]+)>([^<]+)</c>')
STYLE_TAG_CLOSE = re.compile(r'</c>$')

# Lines starting with any of these prefixes are ignored by the parser
SKIP_WORDS = ('WEBVTT', 'NOTE', '/*', 'X-TIMESTAMP-MAP')

# Matches opening <v Name> voice tags only; the closing </v> tag contains no
# whitespace and is therefore removed by HTML_TAG.
SPEAKER_TAG = re.compile(r'<v\s+[^>]+>')
|
|
|
|
|
|
class WebVTTConverter(BaseConverter):
    """WebVTT subtitle converter"""

    def parse(self, stream):
        """
        Parses a WebVTT document and returns it as a SubRipFile

        `stream` is iterated line by line and every item is decoded as UTF-8,
        so it must yield bytes. STYLE blocks are parsed with tinycss and only
        used to turn <c.class> spans whose class has `font-style: italic`
        into <i> tags; all remaining tags except <i> are stripped.
        Cues with a percentage (or zero) `line` setting below 25 get an
        {\\an8} prefix on their first text line.
        """
        srt = SubRipFile()
        looking_for_text = False
        looking_for_style = False
        text = []
        position = None
        line_number = 1
        styles = {}
        current_style = []

        css_parser = tinycss.make_parser('page3')

        for line in stream:
            # As our stream is bytes we have to deal with line breaks here
            line = line.decode('utf-8').replace('\r\n', '\n').replace('\r', '\n').strip()

            # Skip processing any unnecessary lines
            # NOTE(review): this also drops cue text lines that happen to
            # start with one of SKIP_WORDS (e.g. "NOTE ...")
            if line.startswith(SKIP_WORDS):
                continue

            # Empty line separates cues
            if not line:
                # Parse current style
                if looking_for_style:
                    stylesheet = css_parser.parse_stylesheet('\n'.join(current_style))
                    styles.update(self._extract_styles(stylesheet))
                    # Start the next STYLE block from scratch
                    current_style = []
                    looking_for_style = False

                # Keep looking for text if last line has none
                # this will only happen if there's an unexpected line break
                if not text:
                    continue

                srt[-1].content = '\n'.join(text)
                text = []
                looking_for_text = False

            # Check for style start. Only a bare "STYLE" line outside of a cue
            # opens a style block; a substring test would swallow cue text
            # (and CSS selectors) that merely contain the word.
            elif line == 'STYLE' and not looking_for_text:
                looking_for_style = True

            # Check for style content
            elif looking_for_style:
                current_style.append(line)

            # Check for time line
            elif ' --> ' in line:
                parts = line.split()
                # Everything after "start --> end" that looks like key:value
                position = self._get_position([p for p in parts[3:] if ':' in p])

                start, _, end, *_ = parts
                # Fix short timecodes (no hour)
                if start.count(':') == 1:
                    start = f'00:{start}'
                if end.count(':') == 1:
                    end = f'00:{end}'

                srt.append(Subtitle(
                    index=line_number,
                    start=timedelta_from_timestamp(start),
                    end=timedelta_from_timestamp(end),
                    content=''
                ))
                looking_for_text = True
                line_number += 1

            # Append text if we're inside a line
            elif looking_for_text:
                # Unescape html entities
                line = html.unescape(line)

                # Remove speaker tags here
                line = SPEAKER_TAG.sub('', line)

                # Set \an8 tag if position is below 25
                # (value taken from SubtitleEdit)
                if position is not None and position < 25:
                    line = '{\\an8}' + line
                    # Only tag the first text line of the cue
                    position = None

                text.append(line.strip())

        # Add any leftover text to the last line
        # (the stream ended without a trailing empty line)
        if text:
            srt[-1].content += '\n'.join(text)

        italicise = partial(self._replace_italics, styles=styles)
        for cue in srt:
            # Replace styles with italics tag when appropriate
            # (replace instead of match, to handle nested)
            cue.content = STYLE_TAG.sub(italicise, cue.content)

            # Strip non-italic tags
            cue.content = HTML_TAG.sub('', cue.content)

        return srt

    @staticmethod
    def _extract_styles(stylesheet) -> dict[str, dict[str, str]]:
        """
        Collects the declarations of every rule in a parsed tinycss stylesheet

        Rules are keyed by the first IDENT token found inside the first
        FUNCTION token of their selector (presumably the class name in
        `::cue(.name)` - verify against the tinycss tokenizer). Rules without
        such a token are skipped; a later rule with the same name replaces
        an earlier one.
        """
        styles = {}
        for rule in stylesheet.rules:
            ft = next((e for e in rule.selector if e.type == 'FUNCTION'), None)
            if not ft:
                continue
            name = next((t for t in ft.content if t.type == 'IDENT'), None)
            if not name:
                continue
            styles[name.value] = {dec.name: dec.value.as_css() for dec in rule.declarations}
        return styles

    @staticmethod
    def _get_position(cue_settings: list[str]) -> Optional[float]:
        """
        Parses list of cue settings and extracts position offset as a float
        Line number based offset and alignment strings are ignored

        Returns the percentage of the first usable `line` setting (or 0 for
        `line:0`), otherwise None. Malformed `line` settings are skipped
        instead of raising.

        https://www.w3.org/TR/webvtt1/#webvtt-line-cue-setting
        """
        if not cue_settings or cue_settings == ['None']:
            return None

        for setting in cue_settings:
            # partition never fails, even with extra colons in the value
            key, _, val = setting.partition(':')
            if key != 'line' or not val:
                continue

            # Drop the optional alignment part ("line:10%,start")
            val = val.split(',')[0]
            if val.endswith('%'):
                try:
                    return float(val[:-1])
                except ValueError:
                    # e.g. "line:%" or "line:abc%"
                    continue
            if val == '0':
                return 0.0

        return None

    @staticmethod
    def _replace_italics(match: re.Match, styles: dict[str, dict[str, str]]) -> str:
        """
        Substitution callback for STYLE_TAG

        Returns the span text wrapped in <i></i> if the span's class
        (group 1) has `font-style: italic` in `styles`, otherwise the
        matched text unchanged.
        """
        style = styles.get(match[1])
        if style and style.get('font-style') == 'italic':
            return f'<i>{match[2]}</i>'
        return match[0]
|