VT-PR/scripts/subby/subby/processors/common_issues.py
2025-04-14 08:39:57 +05:30

279 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import copy
import datetime
import html
import re
import unicodedata
from datetime import timedelta
import langcodes
from subby import regex as Regex
from subby.processors.base import BaseProcessor
from subby.processors.rtl import RTL_LANGUAGES, RTLFixer
from subby.subripfile import SubRipFile
from subby.utils.time import line_duration
class CommonIssuesFixer(BaseProcessor):
    """Processor fixing common issues found in subtitles.

    The pipeline run by :meth:`process` is:

    1. shift timecodes that start past 23:59 back to zero (`_fix_time_codes`),
    2. clean up the text of every cue with a long list of regex fixes
       (`_correct_subtitles`),
    3. merge cues that share timecodes or text (`_combine_timecodes`),
    4. optionally close very short gaps between cues (`_remove_gaps`),
    5. run the RTL fixer when the language is listed in `RTL_LANGUAGES`.
    """

    # When true, `_correct_subtitles` finishes with `_remove_gaps`, and
    # `_combine_timecodes` merges near-adjacent cues whose text continues the
    # previous cue.
    remove_gaps = True

    def process(self, srt, language=None):
        """Fix common issues in *srt* and report whether anything changed.

        The input is deep-copied first, so *srt* itself is not modified.

        :param srt: subtitle file to process.
        :param language: optional language tag; when its primary language is
            in `RTL_LANGUAGES` the result is additionally passed through
            `RTLFixer`.
        :return: tuple ``(corrected, changed)`` where ``changed`` is the result
            of comparing the corrected file with the original *srt*.
        """
        fixed = self._fix_time_codes(copy.deepcopy(srt))
        corrected = self._correct_subtitles(fixed)
        if language and langcodes.get(language).language in RTL_LANGUAGES:
            corrected, _ = RTLFixer().process(corrected, language=language)

        return corrected, corrected != srt

    def _correct_subtitles(self, srt: SubRipFile) -> SubRipFile:
        """Clean the text of every cue in *srt*, then merge cues.

        Cue contents are modified in place. Afterwards the cues are passed
        through `_combine_timecodes` and, when `remove_gaps` is set,
        `_remove_gaps`.

        :param srt: subtitle file whose cues are rewritten in place.
        :return: the merged (and optionally gap-free) subtitle file.
        """
        def _fix_line(line):
            """Apply all text fixes to a single cue's content string.

            The substitutions are order dependent: earlier fixes normalise
            the text so that later patterns can match.
            """
            # [GENERAL] - Affects other regexes
            # Remove more than one space
            line = re.sub(r' {2,}', ' ', line)
            # Correct lines starting with space
            line = re.sub(r'^\s*', '', line)
            line = re.sub(r'\n\s*', '\n', line)
            #
            # [ENCODING FIXES, CHARACTER REPLACEMENTS]
            # Fix musical notes garbled by encoding
            # (U+00E2 U+2122 U+00AA is U+266A's UTF-8 bytes read as cp1252);
            # has to happen before normalization as that replaces the TM char
            line = line.replace(r'â™ª', r'♪')
            # Normalize unicode characters
            line = unicodedata.normalize('NFKC', line)
            # Replace short hyphen (U+2010) with regular size (U+002D)
            line = line.replace(r'‐', r'-')
            # Replace double note (U+266B) with single note (U+266A)
            line = line.replace(r'♫', r'♪')
            # Replace hashes, asterisks at the start of a line with a musical note
            line = re.sub(
                r'^((?:{\\an8})?(?:<i>)?)(- ?)?[#\*]{1,}(?=\s+)',
                r'\1\2♪',
                line,
                flags=re.M
            )
            # Replace hashes, asterisks at the end of a line with a musical note
            line = re.sub(
                r'(?<=\s)(?<![#\*])(?:[#\*]{1,3}|[#\*]{1,3})(?![0-9A-Z])(</i>$|$)',
                r'♪\1',
                line,
                flags=re.M
            )
            line = re.sub(r'^[#\*]+$', r'♪', line, flags=re.M)
            # Move notes into italics, if rest of the line is
            line = re.sub(r'♪ <i>(.*)', r'<i>♪ \1', line)
            line = re.sub(r'(♪.*)</i>\s*♪', r'\1 ♪</i>', line)
            # Replace some pound signs with notes (Binge...)
            # (Matches only start/end of a line with a space
            # to avoid false positives)
            line = re.sub(r'^£ ', r'♪ ', line)
            line = re.sub(r' £$', r' ♪', line)
            # Duplicated notes
            line = re.sub(r'♪{1,}', r'♪', line)
            # Add spaces between notes and text
            line = re.sub(r'^♪([A-Za-z])', r'♪ \1', line)
            line = re.sub(r'([A-Za-z])♪', r'\1 ♪', line)
            # Replace \h (non-breaking space in ASS) with a regular space
            # (result of ffmpeg extraction of mp4-embedded subtitles)
            line = re.sub(r'(\\h)+', ' ', line).strip()
            # Fix leftover amps (html unescape fixes those, but not when they're duped)
            line = re.sub(r'&(amp;){1,}', r'&', line)
            # Fix "it'`s" -> "it's"
            line = re.sub(r"'[`]", r"'", line)

            # [TAG STRIPPING AND CORRECTING]
            #
            # Replace ASS positioning tags with top only
            line = re.sub(r'(\{\\an[0-9]\}){1,}', r'{\\an8}', line)
            # Remove space after ASS positioning tags
            line = re.sub(r'(\{\\an[0-9]\}) +(?=[A-Za-z-])', r'{\\an8}', line)
            # Fix hanging tags
            line = re.sub(r'^(<[a-z]>)\n', r'\1', line)
            line = re.sub(r'</([a-z])>$\n<([a-z])>', r'\n', line, flags=re.M)
            # Remove duplicated tags
            line = re.sub(r'(<[a-z]>){1,}', r'\1', line)
            line = re.sub(r'(</[a-z]>){1,}', r'\1', line)
            # Remove an unnecessary space after italic tag open
            line = re.sub(r'^(<[a-z]>) {1,}', r'\1', line)
            line = re.sub(r'^ {1,}', '', line)
            # Remove non-italic tags
            line = re.sub(r'</?(?!i>)[a-z]+>', '', line)
            # Remove spaces between tags
            line = re.sub(r'(<[a-z]>|\{\\an8\}) (<[a-z]>|\{\\an8\})', r'\1\2', line)
            # Move hanging opening tags onto separate lines
            line = re.sub(r'(<[a-z]>)\n', r'\n\1', line)
            # Move hanging closing tags onto separate lines
            line = re.sub(r'\n(</[a-z]>)', r'\1\n', line)
            # Move spaces outside italic tags
            line = re.sub(r'(<[a-z]>) ', r' \1', line)
            line = re.sub(r' (</[a-z]>)', r'\1 ', line)
            # Remove needless spaces inside italic tags
            line = re.sub(r'^(<[a-z]>) ', r'\1', line)
            # Fix "</tag>space<tag>"
            line = re.sub(r'(?:</[a-z]>)(\s*)(?:<[a-z]>)', r'\1', line, flags=re.M)
            # Remove empty tags
            line = re.sub(r'<[a-z]>\s*</[a-z]>', r'', line)
            # Move "{\an8}" to the rest of the text if it's on a new line
            line = re.sub(r'({\\an8\})\n', r'\1', line)

            # [REFORMATTING]
            #
            # Remove spaces inside brackets ("( TEXT )" -> "(TEXT)")
            line = re.sub(r'\( (.*) \)', r'(\1)', line)
            # Remove ">> " before text
            line = re.sub(r'(^|\n)(</?[a-z]>|\{\\an8\})?>> ', r'\1\2', line)
            # Remove lines consisting only of ">>"
            line = re.sub(r'(^|\n)(</?[a-z]>|\{\\an8\})?>>($|\n)', r'', line)
            # Replace any leftover <br> tags with a proper line break
            line = re.sub(r'<br ?\/?>', '\n', line)
            # Remove empty lines
            line = re.sub(r'^\.?\s*$', '', line, flags=re.M)
            line = re.sub(r'^-?\s*$', '', line, flags=re.M)
            line = re.sub(r'^(</?i>|\{\\an8\})?\s*$', '', line, flags=re.M)
            # Remove lines consisting only of a single character or digit
            # NOTE(review): the escaped "\[" makes this match only the literal
            # text "[A-Za-z0-9]", not a single character as the comment above
            # says. Left unchanged on purpose; confirm the intent before
            # turning it into a real character class.
            line = re.sub(r'^\[A-Za-z0-9]$', '', line)
            # Adds missing spaces after "...", commas, and tags
            line = re.sub(r'([a-z])(\.\.\.)([a-zA-Z][^.])', r'\1\2 \3', line)
            line = re.sub(r'(</[a-z]>)(\w)', r'\1 \2', line)
            line = re.sub(r'([a-z]),([a-zA-Z])', r'\1, \2', line)
            line = re.sub(r',\n([a-z]+[\.\?])\s*$', r', \1', line)
            # Correct front and end ellipses
            line = re.sub(
                rf'({Regex.FRONT_OPTIONAL_TAGS_WITH_HYPHEN})' r'\.{1,}',
                r'\1...',
                line, flags=re.M
            )
            line = re.sub(r'\.{2,}' rf'({Regex.TAGS})?' r'\s*$', r'...\1', line, flags=re.M)
            # Add space after frontal speaker hyphen
            line = re.sub(r"^(<i>|\{\\an8\})?-+(?='?[\w\"\[\(\<\{\.\$♪¿¡])", r'\1- ', line, flags=re.M)
            # Remove unnecessary space before "--"
            line = re.sub(r'\s*--(\s*)', r'--\1', line, flags=re.M)
            # Move notes inside tags (</i> ♪ -> ♪</i>)
            line = re.sub(r'(</[a-z]>)(\s*♪{1,})$', r'\2\1', line, flags=re.M)
            # Remove trailing spaces
            line = re.sub(r' +$', r'', line, flags=re.M).strip()

            # [LINE SPLITS AND LINE BREAKS]
            #
            # Adds missing line splits (primarily present in Amazon subtitles)
            line = re.sub(r'(.*)([^.][\]\)])([A-Z][^.])', r'\1\2\n\3', line)
            line = re.sub(
                r'(.*)([^\.\sA-Z][!\.;:?])(?<!(?:Mr|Ms)\.)(?<!Mrs\.)([A-Z][^.])',
                r'- \1\2\n- \3',
                line
            )
            # Fix weird linebreaks (caused by stripping SDH or not)
            line = re.sub(r'(^<[a-z]>|\n<[a-z]>)(\w+)\n', r'\1\2 ', line)
            # Add missing hyphens
            line = re.sub(r'^\s*(?!-)(.*)\n- ([A-Z][a-z]+)$', r'- \1\n- \2', line)
            # Remove linebreaks inside lines
            line = re.sub(r'\r\n{1,}', r'\r\n', line).strip()
            line = re.sub(r'\n{1,}', r'\n', line).strip()
            # Remove duplicate spaces around italics
            line = re.sub(r' +</i> +', r'</i> ', line).strip()
            # Remove italics from hyphen, when content immediately following is not italics
            line = re.sub(r'<i>-</i>([^<]+)', r'-\1', line).strip()

            return line

        for line in srt:
            # Unescape html entities (twice, because yes, double encoding happens...)
            for _ in range(2):
                line.content = html.unescape(line.content)
            # Run fix_line twice, as some of the fixes can introduce issues, e.g. double spaces
            for _ in range(2):
                line.content = _fix_line(line.content)

            line.content = line.content.strip()
            # Remove remaining linebreaks
            line.content = line.content.strip('\n')

        # Remove italics if every line is italicized, as this is almost certainly a mistake
        # (using slices should be more performant than regex or startswith/endswith)
        if len(srt) > 10 \
                and all(line.content[:3] == '<i>' and line.content[-4:] == '</i>' for line in srt):
            for line in srt:
                # Drop the leading "<i>" (3 chars) and trailing "</i>" (4 chars)
                line.content = line.content[3:-4]

        combined = self._combine_timecodes(srt)
        if self.remove_gaps:
            return self._remove_gaps(combined)
        return combined

    def _combine_timecodes(self, srt: SubRipFile) -> SubRipFile:
        """Combine cues that share timecodes or repeat the previous text.

        Each cue is compared with the last cue kept so far; the first
        matching rule wins:

        * identical start and end: differing text is appended to the kept cue
          on a new line, identical text is dropped;
        * same text starting less than 10 ms after the kept cue ends: the
          kept cue is extended to this cue's end;
        * gap of 1-85 ms, text starting with the kept cue's text, and
          `remove_gaps` set: the kept cue takes this cue's end and text;
        * gap of exactly 0 ms: the kept cue is shortened by 1 ms and this cue
          is appended;
        * otherwise the cue is appended when its text is not blank.

        :param srt: subtitle file to merge; its cue objects are reused.
        :return: new file with merged cues, or *srt* itself when nothing was
            kept; indexes are cleaned either way.
        """
        subs_copy = SubRipFile([])
        for line in srt:
            if len(subs_copy) == 0:
                subs_copy.append(line)
                continue

            if line_duration(subs_copy[-1]) == line_duration(line) \
                    and subs_copy[-1].start == line.start \
                    and subs_copy[-1].end == line.end:
                if subs_copy[-1].content != line.content:
                    subs_copy[-1].content += '\n' + line.content
            # Merge lines with the same text within 10 ms
            elif self._subtract_ts(line.start, subs_copy[-1].end) < 10 \
                    and line.content == subs_copy[-1].content:
                subs_copy[-1].end = line.end
            # Merge lines with less than 2 frames of gap and same text
            # to avoid duplicating lines as we remove gaps later
            elif 0 < self._subtract_ts(line.start, subs_copy[-1].end) <= 85 \
                    and line.content.startswith(subs_copy[-1].content) \
                    and self.remove_gaps:
                subs_copy[-1].end = line.end
                subs_copy[-1].content = line.content
            # Fix overlapping times
            elif self._subtract_ts(line.start, subs_copy[-1].end) == 0:
                subs_copy[-1].end -= timedelta(milliseconds=1)
                subs_copy.append(line)
            elif line.content.strip():
                subs_copy.append(line)

        # Fall back to the input when no cue was kept
        subs_copy = subs_copy or srt
        subs_copy.clean_indexes()
        return subs_copy

    def _remove_gaps(self, srt: SubRipFile) -> SubRipFile:
        """Remove short gaps between lines.

        When a cue starts more than 1 ms and at most 85 ms after the previous
        kept cue ends, its start is moved to that end and the previous cue is
        shortened by 1 ms. Other cues are kept unchanged, unless their text
        is blank, in which case they are dropped.

        :param srt: subtitle file to process; its cue objects are reused and
            modified.
        :return: new file with gaps closed, or *srt* itself when nothing was
            kept; indexes are cleaned either way.
        """
        subs_copy = SubRipFile([])
        for line in srt:
            if len(subs_copy) == 0:
                subs_copy.append(line)
                continue
            # Remove 2-frame or smaller gaps (2 frames/83ms@24 is Netflix standard)
            elif 1 < self._subtract_ts(line.start, subs_copy[-1].end) <= 85:
                line.start = subs_copy[-1].end
                subs_copy[-1].end -= timedelta(milliseconds=1)
                subs_copy.append(line)
            elif line.content.strip():
                subs_copy.append(line)

        # Fall back to the input when no cue was kept
        subs_copy = subs_copy or srt
        subs_copy.clean_indexes()
        return subs_copy

    @staticmethod
    def _fix_time_codes(srt: SubRipFile) -> SubRipFile:
        """Fixes timecodes over 23:59, often present in live content.

        The whole-hour part of the first cue start found above 23 hours
        becomes the offset; that cue and every later one is shifted back by
        it. Cues before that point are left untouched.

        :param srt: subtitle file; its cues are modified in place.
        :return: the same *srt* object.
        """
        offset = 0
        for line in srt:
            hours, _ = divmod(line.start.seconds, 3600)
            # timedelta keeps whole days separately from seconds
            hours += line.start.days * 24
            if not offset and hours > 23:
                offset = hours
            if offset:
                line.start -= datetime.timedelta(hours=offset)
                line.end -= datetime.timedelta(hours=offset)
        return srt

    @staticmethod
    def _subtract_ts(ts1: datetime.timedelta, ts2: datetime.timedelta) -> int:
        """Return ``ts1 - ts2`` as a rounded integer number of milliseconds."""
        return round((ts1 - ts2).total_seconds() * 1000)