forked from ayuspie/VT-PR
279 lines
12 KiB
Python
279 lines
12 KiB
Python
import copy
|
||
import datetime
|
||
import html
|
||
import re
|
||
import unicodedata
|
||
from datetime import timedelta
|
||
|
||
import langcodes
|
||
|
||
from subby import regex as Regex
|
||
from subby.processors.base import BaseProcessor
|
||
from subby.processors.rtl import RTL_LANGUAGES, RTLFixer
|
||
from subby.subripfile import SubRipFile
|
||
from subby.utils.time import line_duration
|
||
|
||
|
||
class CommonIssuesFixer(BaseProcessor):
    """Processor fixing common issues found in subtitles.

    The pipeline run by :meth:`process` is:

    1. shift timecodes that start past 23 hours (``_fix_time_codes``),
    2. clean up the text of every line (``_correct_subtitles``),
    3. merge duplicated/adjacent lines (``_combine_timecodes``),
    4. optionally close short gaps between lines (``_remove_gaps``),
    5. run ``RTLFixer`` when the given language is in ``RTL_LANGUAGES``.
    """

    # When true, short gaps between consecutive lines are closed after the
    # text fixes, and near-adjacent lines whose text extends the previous
    # line are merged in `_combine_timecodes`.
    remove_gaps = True

    def process(self, srt, language=None):
        """Fix common issues in ``srt`` and report whether anything changed.

        The input is deep-copied first, so ``srt`` itself is not modified.

        :param srt: subtitle file to process
        :param language: optional language tag; when its primary language
            (as parsed by ``langcodes``) is in ``RTL_LANGUAGES`` the result
            is additionally passed through ``RTLFixer``
        :return: tuple of ``(corrected, changed)`` where ``changed`` is the
            result of comparing the corrected file with the input
        """
        fixed = self._fix_time_codes(copy.deepcopy(srt))
        corrected = self._correct_subtitles(fixed)

        if language and langcodes.get(language).language in RTL_LANGUAGES:
            corrected, _ = RTLFixer().process(corrected, language=language)

        return corrected, corrected != srt

    def _correct_subtitles(self, srt: SubRipFile) -> SubRipFile:
        """Clean up the text of every line, then merge lines and close gaps.

        Line contents are modified in place on ``srt``. The returned file is
        the result of ``_combine_timecodes`` (and ``_remove_gaps`` when
        ``self.remove_gaps`` is true).
        """

        def _fix_line(line):
            """Apply all text-level fixes to a single subtitle's content.

            NOTE: the order of the substitutions matters, later patterns rely
            on the output of earlier ones.
            """
            # [GENERAL] - Affects other regexes
            # Remove more than one space
            line = re.sub(r' {2,}', ' ', line)
            # Correct lines starting with space
            line = re.sub(r'^\s*', '', line)
            line = re.sub(r'\n\s*', '\n', line)
            #
            # [ENCODING FIXES, CHARACTER REPLACEMENTS]
            # Fix musical notes garbled by encoding: the UTF-8 bytes of a
            # musical note decoded as cp1252 show up as U+00E2 U+2122 U+00AA.
            # Has to happen before normalization as that replaces the TM char.
            # Written with escapes so the pattern survives file re-encoding.
            line = line.replace('\u00e2\u2122\u00aa', '♪')
            # Normalize unicode characters
            line = unicodedata.normalize('NFKC', line)
            # Replace short hyphen with regular size
            line = line.replace(r'‐', r'-')
            # Replace double note with single note
            line = line.replace(r'♫', r'♪')
            # Replace hashes, asterisks at the start of a line with a musical note
            line = re.sub(
                r'^((?:{\\an8})?(?:<i>)?)(- ?)?[#\*]{1,}(?=\s+)',
                r'\1\2♪',
                line,
                flags=re.M
            )
            # Replace hashes, asterisks at the end of a line with a musical note
            line = re.sub(
                r'(?<=\s)(?<![#\*])(?:[#\*]{1,3}|[#\*]{1,3})(?![0-9A-Z])(</i>$|$)',
                r'♪\1',
                line,
                flags=re.M
            )
            line = re.sub(r'^[#\*]+$', r'♪', line, flags=re.M)
            # Move notes into italics, if rest of the line is
            line = re.sub(r'♪ <i>(.*)', r'<i>♪ \1', line)
            line = re.sub(r'(♪.*)</i>\s*♪', r'\1 ♪</i>', line)
            # Replace some pound signs with notes (Binge...)
            # (Matches only start/end of a line with a space
            # to avoid false positives)
            line = re.sub(r'^£ ', r'♪ ', line)
            line = re.sub(r' £$', r' ♪', line)
            # Duplicated notes
            line = re.sub(r'♪{1,}', r'♪', line)
            # Add spaces between notes and text
            line = re.sub(r'^♪([A-Za-z])', r'♪ \1', line)
            line = re.sub(r'([A-Za-z])♪', r'\1 ♪', line)
            # Replace \h (non-breaking space in ASS) with a regular space
            # (result of ffmpeg extraction of mp4-embedded subtitles)
            line = re.sub(r'(\\h)+', ' ', line).strip()
            # Fix leftover amps (html unescape fixes those, but not when they're duped)
            line = re.sub(r'&(amp;){1,}', r'&', line)
            # Fix "it'`s" -> "it's"
            line = re.sub(r"'[`’]", r"'", line)

            # [TAG STRIPPING AND CORRECTING]
            #
            # Replace ASS positioning tags with top only
            line = re.sub(r'(\{\\an[0-9]\}){1,}', r'{\\an8}', line)
            # Remove space after ASS positioning tags
            line = re.sub(r'(\{\\an[0-9]\}) +(?=[A-Za-z-])', r'{\\an8}', line)
            # Fix hanging tags
            line = re.sub(r'^(<[a-z]>)\n', r'\1', line)
            line = re.sub(r'</([a-z])>$\n<([a-z])>', r'\n', line, flags=re.M)
            # Remove duplicated tags
            line = re.sub(r'(<[a-z]>){1,}', r'\1', line)
            line = re.sub(r'(</[a-z]>){1,}', r'\1', line)
            # Remove an unnecessary space after italic tag open
            line = re.sub(r'^(<[a-z]>) {1,}', r'\1', line)
            line = re.sub(r'^ {1,}', '', line)
            # Remove non-italic tags
            line = re.sub(r'</?(?!i>)[a-z]+>', '', line)
            # Remove spaces between tags
            line = re.sub(r'(<[a-z]>|\{\\an8\}) (<[a-z]>|\{\\an8\})', r'\1\2', line)
            # Move hanging opening tags onto separate lines
            line = re.sub(r'(<[a-z]>)\n', r'\n\1', line)
            # Move hanging closing tags onto separate lines
            line = re.sub(r'\n(</[a-z]>)', r'\1\n', line)
            # Move spaces outside italic tags
            line = re.sub(r'(<[a-z]>) ', r' \1', line)
            line = re.sub(r' (</[a-z]>)', r'\1 ', line)
            # Remove needless spaces inside italic tags
            line = re.sub(r'^(<[a-z]>) ', r'\1', line)
            # Fix "</tag>space<tag>"
            line = re.sub(r'(?:</[a-z]>)(\s*)(?:<[a-z]>)', r'\1', line, flags=re.M)
            # Remove empty tags
            line = re.sub(r'<[a-z]>\s*</[a-z]>', r'', line)
            # Move "{\an8}" to the rest of the text if it's on a new line
            line = re.sub(r'({\\an8\})\n', r'\1', line)

            # [REFORMATTING]
            #
            # Remove spaces inside brackets ("( TEXT )" -> "(TEXT)")
            line = re.sub(r'\( (.*) \)', r'(\1)', line)
            # Remove ">> " before text
            line = re.sub(r'(^|\n)(</?[a-z]>|\{\\an8\})?>> ', r'\1\2', line)
            # Remove lines consisting only of ">>"
            line = re.sub(r'(^|\n)(</?[a-z]>|\{\\an8\})?>>($|\n)', r'', line)
            # Replace any leftover <br> tags with a proper line break
            line = re.sub(r'<br ?\/?>', '\n', line)
            # Remove empty lines
            line = re.sub(r'^\.?\s*$', '', line, flags=re.M)
            line = re.sub(r'^-?\s*$', '', line, flags=re.M)
            line = re.sub(r'^(</?i>|\{\\an8\})?\s*$', '', line, flags=re.M)
            # Remove content consisting only of a single character or digit.
            # (The opening bracket used to be escaped, which made the pattern
            # match the literal text "[A-Za-z0-9]" instead of a character class.)
            # No re.M on purpose: only a subtitle whose *entire* content is one
            # alphanumeric character is blanked.
            line = re.sub(r'^[A-Za-z0-9]$', '', line)
            # Adds missing spaces after "...", commas, and tags
            line = re.sub(r'([a-z])(\.\.\.)([a-zA-Z][^.])', r'\1\2 \3', line)
            line = re.sub(r'(</[a-z]>)(\w)', r'\1 \2', line)
            line = re.sub(r'([a-z]),([a-zA-Z])', r'\1, \2', line)
            line = re.sub(r',\n([a-z]+[\.\?])\s*$', r', \1', line)
            # Correct front and end ellipses
            line = re.sub(
                rf'({Regex.FRONT_OPTIONAL_TAGS_WITH_HYPHEN})' r'\.{1,}',
                r'\1...',
                line, flags=re.M
            )
            line = re.sub(r'\.{2,}' rf'({Regex.TAGS})?' r'\s*$', r'...\1', line, flags=re.M)
            # Add space after frontal speaker hyphen
            line = re.sub(r"^(<i>|\{\\an8\})?-+(?='?[\w\"\[\(\<\{\.\$♪¿¡])", r'\1- ', line, flags=re.M)
            # Remove unnecessary space before "--"
            line = re.sub(r'\s*--(\s*)', r'--\1', line, flags=re.M)
            # Move trailing notes inside tags ("</i> ♪" -> " ♪</i>")
            line = re.sub(r'(</[a-z]>)(\s*♪{1,})$', r'\2\1', line, flags=re.M)
            # Remove trailing spaces
            line = re.sub(r' +$', r'', line, flags=re.M).strip()

            # [LINE SPLITS AND LINE BREAKS]
            #
            # Adds missing line splits (primarily present in Amazon subtitles)
            line = re.sub(r'(.*)([^.][\]\)])([A-Z][^.])', r'\1\2\n\3', line)
            line = re.sub(
                r'(.*)([^\.\sA-Z][!\.;:?])(?<!(?:Mr|Ms)\.)(?<!Mrs\.)([A-Z][^.])',
                r'- \1\2\n- \3',
                line
            )
            # Fix weird linebreaks (caused by stripping SDH or not)
            line = re.sub(r'(^<[a-z]>|\n<[a-z]>)(\w+)\n', r'\1\2 ', line)
            # Add missing hyphens
            line = re.sub(r'^\s*(?!-)(.*)\n- ([A-Z][a-z]+)$', r'- \1\n- \2', line)
            # Collapse repeated linebreaks inside lines
            line = re.sub(r'\r\n{1,}', r'\r\n', line).strip()
            line = re.sub(r'\n{1,}', r'\n', line).strip()
            # Remove duplicate spaces around italics
            line = re.sub(r' +</i> +', r'</i> ', line).strip()
            # Remove italics from hyphen, when content immediately following is not italics
            line = re.sub(r'<i>-</i>([^<]+)', r'-\1', line).strip()

            return line

        for line in srt:
            # Unescape html entities (twice, because yes, double encoding happens...)
            for _ in range(2):
                line.content = html.unescape(line.content)

            # Run fix_line twice, as some of the fixes can introduce issues, e.g. double spaces
            for _ in range(2):
                line.content = _fix_line(line.content)
                line.content = line.content.strip()

            # Remove remaining linebreaks
            line.content = line.content.strip('\n')

        # Remove italics if every line is italicized, as this is almost certainly a mistake
        # (using slices should be more performant than regex or startswith/endswith)
        if len(srt) > 10 \
                and all(line.content[:3] == '<i>' and line.content[-4:] == '</i>' for line in srt):
            for line in srt:
                line.content = line.content[3:-4]

        combined = self._combine_timecodes(srt)
        if self.remove_gaps:
            return self._remove_gaps(combined)

        return combined

    def _combine_timecodes(self, srt: SubRipFile) -> SubRipFile:
        """Combine lines with the same timecodes or the same content.

        Walks ``srt`` in order, comparing each line with the last line kept
        so far. Lines with empty content that match none of the merge rules
        are dropped.
        """
        subs_copy = SubRipFile([])
        for line in srt:
            if len(subs_copy) == 0:
                subs_copy.append(line)
                continue

            # Identical timing: join differing text with a line break;
            # an exact duplicate (same timing and text) is simply skipped
            if line_duration(subs_copy[-1]) == line_duration(line) \
                    and subs_copy[-1].start == line.start \
                    and subs_copy[-1].end == line.end:
                if subs_copy[-1].content != line.content:
                    subs_copy[-1].content += '\n' + line.content
            # Merge lines with the same text within 10 ms
            elif self._subtract_ts(line.start, subs_copy[-1].end) < 10 \
                    and line.content == subs_copy[-1].content:
                subs_copy[-1].end = line.end
            # Merge lines with less than 2 frames of gap and same text
            # to avoid duplicating lines as we remove gaps later
            elif 0 < self._subtract_ts(line.start, subs_copy[-1].end) <= 85 \
                    and line.content.startswith(subs_copy[-1].content) \
                    and self.remove_gaps:
                subs_copy[-1].end = line.end
                subs_copy[-1].content = line.content
            # Line starts exactly when the previous one ends: pull the
            # previous end back by 1 ms so the two do not share a timestamp
            elif self._subtract_ts(line.start, subs_copy[-1].end) == 0:
                subs_copy[-1].end -= timedelta(milliseconds=1)
                subs_copy.append(line)
            elif line.content.strip():
                subs_copy.append(line)

        subs_copy = subs_copy or srt
        subs_copy.clean_indexes()
        return subs_copy

    def _remove_gaps(self, srt: SubRipFile) -> SubRipFile:
        """Remove short gaps between lines.

        A line starting more than 1 ms and at most 85 ms after the previous
        line's end is moved to start at that end, and the previous line is
        shortened by 1 ms. Other lines are kept only if their content is
        not blank.
        """
        subs_copy = SubRipFile([])
        for line in srt:
            if len(subs_copy) == 0:
                subs_copy.append(line)
                continue

            # Remove 2-frame or smaller gaps (2 frames/83ms@24 is Netflix standard)
            if 1 < self._subtract_ts(line.start, subs_copy[-1].end) <= 85:
                line.start = subs_copy[-1].end
                subs_copy[-1].end -= timedelta(milliseconds=1)
                subs_copy.append(line)
            elif line.content.strip():
                subs_copy.append(line)

        subs_copy = subs_copy or srt
        subs_copy.clean_indexes()
        return subs_copy

    @staticmethod
    def _fix_time_codes(srt: SubRipFile) -> SubRipFile:
        """Fix timecodes over 23:59, often present in live content.

        The whole-hour part of the first start time found past 23 hours is
        used as an offset, which is subtracted from that line and every line
        after it. Lines are modified in place and ``srt`` is returned.
        """
        offset = 0
        for line in srt:
            hours, _ = divmod(line.start.seconds, 3600)
            # timedelta keeps whole days separately from `.seconds`
            hours += line.start.days * 24

            if not offset and hours > 23:
                offset = hours
            if offset:
                line.start -= timedelta(hours=offset)
                line.end -= timedelta(hours=offset)
        return srt

    @staticmethod
    def _subtract_ts(ts1: timedelta, ts2: timedelta) -> int:
        """Return ``ts1 - ts2`` as a rounded integer number of milliseconds."""
        return round((ts1 - ts2).total_seconds() * 1000)
|