|
import math |
|
import os |
|
import time |
|
import json |
|
import re |
|
import xml.etree.ElementTree as ElementTree |
|
from html import unescape |
|
from typing import Dict, Optional |
|
|
|
from pytubefix import request |
|
from pytubefix.helpers import safe_filename, target_directory |
|
|
|
|
|
class Caption:
    """Container for a single caption track.

    Holds the track's download URL, display name and language code, and
    converts the downloaded XML into SubRip (srt) or plain text.
    """

    # First line of an SRT block: the numeric sequence index.
    _SRT_INDEX_RE = re.compile(r'[0-9]+')
    # Second line of an SRT block: starts with an ``HH:MM:SS`` timestamp
    # (two or more hour digits, since hours are not wrapped at 24).
    _SRT_TIMING_RE = re.compile(r'[0-9]{2,}:[0-9]{2}:[0-9]{2}')
    # An existing ``fmt=<value>`` query parameter; group 1 keeps the
    # separator and the key so only the value is rewritten.
    _FMT_PARAM_RE = re.compile(r'((?:^|[?&])fmt=)[^&#]*')

    def __init__(self, caption_track: Dict):
        """Construct a :class:`Caption <Caption>`.

        :param dict caption_track:
            Caption track data extracted from ``watch_html``. Must contain
            the keys ``name`` and ``vssId``; ``baseUrl`` is optional.
        """
        self.url = caption_track.get("baseUrl")

        # The display name is stored either directly under ``simpleText``
        # or inside a list of ``runs``; with runs, the last entry that has
        # a ``text`` key wins.
        name_dict = caption_track['name']
        if 'simpleText' in name_dict:
            self.name = name_dict['simpleText']
        else:
            for el in name_dict['runs']:
                if 'text' in el:
                    self.name = el['text']

        # The code is the ``vssId`` with leading/trailing dots removed,
        # e.g. ``.en`` becomes ``en`` while ``a.en`` is left untouched.
        self.code = caption_track["vssId"].strip('.')

    @property
    def xml_captions(self) -> str:
        """Download the xml caption tracks.

        Every access performs a new request for ``self.url``.
        """
        return request.get(self.url)

    @staticmethod
    def _json3_url(url: str) -> str:
        """Return ``url`` with its ``fmt`` query parameter set to ``json3``.

        An existing ``fmt`` value is replaced; when the URL has no ``fmt``
        parameter, ``&fmt=json3`` is appended.
        """
        rewritten, replaced = Caption._FMT_PARAM_RE.subn(r'\g<1>json3', url)
        if replaced:
            return rewritten
        return f'{url}&fmt=json3'

    @property
    def json_captions(self) -> dict:
        """Download and parse the json caption tracks.

        :rtype: dict
        :raises AssertionError:
            If the response's ``wireMagic`` field is not ``pb3``.
        """
        text = request.get(self._json3_url(self.url))
        parsed = json.loads(text)
        # Raised explicitly (rather than via ``assert``) so the check is
        # not stripped when Python runs with ``-O``.
        if parsed['wireMagic'] != 'pb3':
            raise AssertionError('Unexpected captions format')
        return parsed

    def generate_srt_captions(self) -> str:
        """Generate "SubRip Subtitle" captions.

        Takes the xml captions from :meth:`~pytubefix.Caption.xml_captions`
        and recompiles them into the "SubRip Subtitle" format.
        """
        return self.xml_caption_to_srt(self.xml_captions)

    @staticmethod
    def _srt_to_txt(srt_captions: str) -> str:
        """Strip sequence numbers and timings from SRT text.

        The input is split into blank-line separated blocks. When a block
        starts with an index line followed by a timing line, those two
        lines are dropped; every remaining non-empty line is stripped and
        the lines are joined with single spaces.
        """
        pieces = []
        for block in srt_captions.split('\n\n'):
            lines = block.splitlines()
            # Only the structural header is removed, so caption text that
            # itself is a number or begins with a timestamp is preserved.
            if (
                len(lines) >= 2
                and Caption._SRT_INDEX_RE.fullmatch(lines[0])
                and Caption._SRT_TIMING_RE.match(lines[1])
            ):
                lines = lines[2:]
            pieces.extend(line.strip() for line in lines if line)
        return ' '.join(pieces).strip()

    def generate_txt_captions(self) -> str:
        """Generate Text captions.

        Takes the "SubRip Subtitle" format captions and converts them into
        text by removing the sequence numbers and timing lines.
        """
        return self._srt_to_txt(self.generate_srt_captions())

    def save_captions(self, filename: str):
        """Generate and save "SubRip Subtitle" captions to a text file.

        Takes the xml captions from :meth:`~pytubefix.Caption.xml_captions`
        and recompiles them into the "SubRip Subtitle" format and saves it
        to a text file.

        :param filename: The name of the file to save the captions.
        """
        # Build the content before opening so a failed download does not
        # leave an empty file behind.
        srt_captions = self.generate_srt_captions()

        with open(filename, 'w', encoding='utf-8') as file:
            file.write(srt_captions)

    @staticmethod
    def float_to_srt_time_format(d: float) -> str:
        """Convert decimal durations into proper srt format.

        :param float d:
            Duration in seconds. Negative values are treated as zero.
        :rtype: str
        :returns:
            SubRip Subtitle (str) formatted time duration.

        float_to_srt_time_format(3.89) -> '00:00:03,890'
        """
        # Round the whole value once: rounding only the fractional part
        # loses the carry when it rounds up to a full second (3.9996 must
        # become 00:00:04,000).
        seconds_text, millis = f"{max(d, 0.0):.3f}".split(".")
        minutes, seconds = divmod(int(seconds_text), 60)
        hours, minutes = divmod(minutes, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis}"

    def xml_caption_to_srt(self, xml_captions: str) -> str:
        """Convert xml caption tracks to "SubRip Subtitle (srt)".

        Only ``<p>`` and ``<text>`` elements that are direct children of
        the document root are converted. Timing is read from ``t``/``d``
        (milliseconds) when present, otherwise from ``start``/``dur``
        (seconds); a missing duration counts as zero.

        :param str xml_captions:
            XML formatted caption tracks.
        :rtype: str
        :raises KeyError:
            If a caption element has neither a ``t`` nor a ``start``
            attribute.
        """
        segments = []
        root = ElementTree.fromstring(xml_captions)

        for child in root:
            if child.tag not in ('p', 'text'):
                continue

            if len(child) == 0:
                # Plain element: the caption is its own text (may be None).
                caption = child.text
            else:
                # Segmented element: concatenate the ``<s>`` children,
                # skipping empty ones so no literal "None" is emitted.
                caption = ''.join(
                    f' {s.text}'
                    for s in child
                    if s.tag == 's' and s.text is not None
                )
            if not caption:
                continue
            caption = unescape(caption.replace("\n", " ").replace("  ", " "))

            attrib = child.attrib
            if "d" in attrib:
                duration = float(attrib["d"]) / 1000.0
            elif "dur" in attrib:
                duration = float(attrib["dur"])
            else:
                duration = 0.0

            if "t" in attrib:
                start = float(attrib["t"]) / 1000.0
            else:
                start = float(attrib["start"])

            end = start + duration
            # Sequence numbers count emitted captions only, starting at 1.
            segments.append(
                "{seq}\n{start} --> {end}\n{text}\n".format(
                    seq=len(segments) + 1,
                    start=self.float_to_srt_time_format(start),
                    end=self.float_to_srt_time_format(end),
                    text=caption,
                )
            )
        return "\n".join(segments).strip()

    def download(
        self,
        title: str,
        srt: bool = True,
        output_path: Optional[str] = None,
        filename_prefix: Optional[str] = None,
    ) -> str:
        """Write the caption track to disk.

        The final file name is
        ``<filename_prefix><title> (<code>).<srt|xml>``, with prefix and
        title passed through ``safe_filename``.

        :param title:
            Output filename (stem only) for writing the caption file. A
            trailing ``.srt`` or ``.xml`` extension is removed.
        :type title: str
        :param srt:
            Set to True to download srt, false to download xml. Defaults
            to True.
        :type srt: bool
        :param output_path:
            (optional) Output path for writing the caption file. If one is
            not specified, defaults to the current working directory.
        :type output_path: str or None
        :param filename_prefix:
            (optional) A string that will be prepended to the filename.
            For example a number in a playlist or the name of a series.
            If one is not specified, nothing will be prepended
            This is separate from filename so you can use the default
            filename but still add a prefix.
        :type filename_prefix: str or None

        :rtype: str
        :returns:
            Path of the written file.
        """
        if title.endswith((".srt", ".xml")):
            # Both recognised extensions are four characters long.
            filename = title[:-4]
        else:
            filename = title

        if filename_prefix:
            filename = f"{safe_filename(filename_prefix)}{filename}"

        filename = safe_filename(filename)

        filename += f" ({self.code})"
        filename += ".srt" if srt else ".xml"

        file_path = os.path.join(target_directory(output_path), filename)

        # Fetch/convert first so a failed download does not leave an empty
        # file on disk.
        content = self.generate_srt_captions() if srt else self.xml_captions

        with open(file_path, "w", encoding="utf-8") as file_handle:
            file_handle.write(content)

        return file_path

    def __repr__(self):
        """Printable object representation."""
        return f'<Caption lang="{self.name}" code="{self.code}">'
|
|