Spaces:
Running
Running
#!/usr/bin/python3 | |
# -*- coding: utf-8 -*- | |
import asyncio
import json
import logging
import os
import random
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Set

import aiofiles
import httpx

from project_settings import project_path
logger = logging.getLogger("toolbox")

# Name of the environment variable that supplies the Douyin session cookie
# (the raw value of the browser's ``Cookie`` request header).
DOUYIN_COOKIE_ENV = "DOUYIN_COOKIE"

# Default request headers sent with every Douyin request.
#
# SECURITY: the session cookie is a login credential and must never be
# committed to source control. It is read from the environment instead;
# when the variable is unset no cookie header is sent at all (requests are
# then anonymous and the API may refuse them).
DEFAULT_HEADERS = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
    "referer": "https://www.douyin.com/",
}
if os.environ.get(DOUYIN_COOKIE_ENV):
    DEFAULT_HEADERS["cookie"] = os.environ[DOUYIN_COOKIE_ENV]
class DouyinDownloader(object):
    """Poll one Douyin account's post feed and download videos newer than a cutoff.

    Downloaded videos are recorded in a JSON "info file" keyed by aweme id so
    that repeated runs skip what is already on disk.

    The instance owns an ``httpx.AsyncClient``; release it with ``await
    downloader.close()`` or by using the instance as an async context manager.
    """

    # Accepted ``min_date`` layouts: full timestamp first, then date-only
    # (interpreted as midnight).
    MIN_DATE_FORMATS = ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d")
    # HTTP status codes treated as redirects when downloading a video.
    REDIRECT_STATUS_CODES = (301, 302, 303, 307, 308)
    # Upper bound on redirects followed for a single download.
    MAX_REDIRECTS = 10
    # Upper bound on feed pages fetched by one listing pass.
    MAX_PAGES = 100

    def __init__(self,
                 platform: str,
                 user_name: str,
                 sec_user_id: str,
                 min_date: str = None,
                 headers: dict = None,
                 cookies: str = None,
                 check_interval: int = 10*60,
                 file_output_fmt: str = "mp4",
                 file_output_dir: str = "output",
                 file_info_file: str = "file_info.json",
                 ):
        """Store the configuration and create the HTTP client.

        Args:
            platform: Label used only in log prefixes.
            user_name: Label used only in log prefixes.
            sec_user_id: Value sent as the ``sec_user_id`` query parameter.
            min_date: Cutoff used by :meth:`run`; defaults to "now".
            headers: Request headers; defaults to a copy of ``DEFAULT_HEADERS``.
            cookies: Either a raw ``Cookie`` header string (merged into the
                headers) or any cookie container httpx accepts.
            check_interval: Seconds between polls; also the HTTP timeout and
                the base unit for access-denied back-off.
            file_output_fmt: File extension for saved videos.
            file_output_dir: Directory that receives the video files.
            file_info_file: Path of the JSON bookkeeping file.
        """
        self.platform = platform
        self.user_name = user_name
        self.sec_user_id = sec_user_id
        self.min_date = min_date or time.strftime("%Y-%m-%d %H:%M:%S")
        # Copy so per-instance changes never leak into the shared default dict.
        self.headers = dict(headers or DEFAULT_HEADERS)
        if isinstance(cookies, str) and cookies:
            # httpx's ``cookies=`` only accepts mappings / cookie jars; a raw
            # header string has to travel as the "cookie" header instead.
            self.headers["cookie"] = cookies
        self.cookies = cookies
        self.check_interval = check_interval
        self.file_output_fmt = file_output_fmt
        self.file_output_dir: Path = Path(file_output_dir)
        self.file_info_file: Path = Path(file_info_file)
        self.flag = f"[{self.__class__.__name__}][{self.platform}][{self.user_name}]"

        # state
        self.run_recording = False
        self.downloaded_video_aweme_id_set = None
        self.downloaded_video_title_set = None

        # client
        self.client = self.get_client()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.close()

    async def close(self) -> None:
        """Close the underlying HTTP client and its pooled connections."""
        await self.client.aclose()

    def get_client(self) -> "httpx.AsyncClient":
        """Build the HTTP/2 client configured with this instance's headers."""
        # A string cookie was already merged into the headers in __init__.
        cookies = None if isinstance(self.cookies, str) else self.cookies
        return httpx.AsyncClient(
            http2=True,
            timeout=self.check_interval,
            limits=httpx.Limits(max_keepalive_connections=100, keepalive_expiry=self.check_interval * 2),
            headers=self.headers,
            cookies=cookies,
        )

    async def request(self, method: str, url: str, **kwargs):
        """Send one request through the shared client.

        Raises:
            ConnectionError: on httpx read/connect/protocol errors; the
                original httpx exception is chained as ``__cause__``.
        """
        try:
            return await self.client.request(method, url, **kwargs)
        except (httpx.ReadError, httpx.ConnectError, httpx.RemoteProtocolError) as e:
            raise ConnectionError(
                f"request failed; error type: {type(e)}, error text: {str(e)}, url: {url}"
            ) from e

    async def save_downloaded_video_info(self, create_time: str, filename: Path, url: str, video_id: str, title: str, desc: str, tags: List[str]) -> str:
        """Add (or overwrite) one video record in the info file.

        Returns:
            The info file path in POSIX form.
        """
        video_info = await self.load_downloaded_video_info()
        video_info[video_id] = {
            "create_time": create_time,
            "filename": Path(filename).as_posix(),
            "title": title,
            "desc": desc,
            "tags": tags,
            "url": url,
            "video_id": video_id,
        }
        payload = json.dumps(video_info, ensure_ascii=False, indent=2)

        self.file_info_file.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sibling temp file and rename it into place so that an
        # interrupted write cannot leave a truncated, unparsable JSON file.
        tmp_file = self.file_info_file.with_name(self.file_info_file.name + ".tmp")
        async with aiofiles.open(tmp_file.as_posix(), "w", encoding="utf-8") as f:
            await f.write(f"{payload}\n")
        tmp_file.replace(self.file_info_file)
        return self.file_info_file.as_posix()

    async def load_downloaded_video_info(self) -> Dict[str, dict]:
        """Return the info file content, or ``{}`` if it is missing or empty."""
        if not self.file_info_file.exists():
            return dict()
        async with aiofiles.open(self.file_info_file.as_posix(), "r", encoding="utf-8") as f:
            data = await f.read()
        if not data.strip():
            return dict()
        return json.loads(data)

    async def get_downloaded_video_aweme_id_set(self) -> Set[str]:
        """Return the ids (info file keys) of all recorded videos."""
        video_info_dict = await self.load_downloaded_video_info()
        return set(video_info_dict)

    async def get_downloaded_video_title_set(self) -> Set[str]:
        """Return the titles of all recorded videos."""
        video_info_dict = await self.load_downloaded_video_info()
        return {v["title"] for v in video_info_dict.values() if "title" in v}

    async def download_video_by_url(self, filename: Path, url: str):
        """Download ``url`` into ``filename``, following redirects manually.

        Returns:
            ``filename`` as a ``Path``.

        Raises:
            AssertionError: on a non-200 final status, a redirect without a
                ``Location`` header, or more than ``MAX_REDIRECTS`` redirects.
        """
        filename = Path(filename)
        filename.parent.mkdir(parents=True, exist_ok=True)

        for _ in range(self.MAX_REDIRECTS + 1):
            response = await self.request(
                method="GET",
                url=url,
                headers=self.headers,
            )
            if response.status_code in self.REDIRECT_STATUS_CODES:
                location = response.headers.get("Location")
                if not location:
                    raise AssertionError(f"Got status code {response.status_code} without Location header")
                # Location may be relative; resolve it against the request URL.
                url = str(response.url.join(location))
                continue
            if response.status_code != 200:
                raise AssertionError(f"Got status code {response.status_code}")

            # Write to "<name>.part" first: callers treat an existing target
            # file as a finished download, so it must never be half-written.
            part_file = filename.with_name(filename.name + ".part")
            async with aiofiles.open(part_file.as_posix(), "wb") as f:
                await f.write(response.content)
            part_file.replace(filename)
            return filename

        raise AssertionError(f"Got more than {self.MAX_REDIRECTS} redirects, url: {url}")

    @staticmethod
    def _extract_tags(text_extra) -> List[str]:
        """Collect unique, non-empty tag names in first-seen order."""
        tags = dict()
        for item in text_extra or []:
            tag = item.get("hashtag_name")
            if tag is None:
                tag = item.get("search_text")
            if not tag:
                continue
            tags[tag] = None
        return list(tags)

    @staticmethod
    def _strip_tags_from_title(desc: str, tags: List[str]) -> str:
        """Remove every ``#tag`` occurrence from ``desc`` and trim whitespace."""
        title = desc
        # Longest first, so "#cats" is removed before "#cat" can eat its prefix.
        for tag in sorted(tags, key=len, reverse=True):
            title = title.replace(f"#{tag}", "")
        return title.strip()

    @staticmethod
    def _parse_aweme(aweme: dict) -> dict:
        """Flatten one raw ``aweme_list`` entry into the row dict used downstream."""
        desc = aweme.get("desc") or ""
        create_time = aweme["create_time"]
        tags = DouyinDownloader._extract_tags(aweme.get("text_extra"))
        # Entries without a playable video yield an empty URL list.
        play_addr = (aweme.get("video") or {}).get("play_addr") or {}
        return {
            "aweme_id": aweme["aweme_id"],
            "create_time": create_time,
            "create_time_str": datetime.fromtimestamp(create_time).strftime("%Y%m%d_%H%M%S"),
            "title": DouyinDownloader._strip_tags_from_title(desc, tags),
            "desc": desc,
            "video_url_list": play_addr.get("url_list") or [],
            "tags": tags,
        }

    async def post_aweme_list(self, max_cursor: int = 0, count: int = 18):
        """Fetch one page of the account's post feed.

        On status 444, or 200 with an empty body, the call sleeps
        (``check_interval * 3`` / ``check_interval * 10`` seconds) and retries
        until a different response arrives.

        Returns:
            A list of row dicts (see :meth:`_parse_aweme`).

        Raises:
            AssertionError: on any other non-200 status.
        """
        url = "https://www.douyin.com/aweme/v1/web/aweme/post/"
        params = {
            "device_platform": "webapp",
            "aid": "6383",
            "channel": "channel_pc_web",
            "sec_user_id": self.sec_user_id,
            "max_cursor": max_cursor,
            "count": count,
            "publish_video_strategy_type": "2",
            "version_code": "290100",
            "version_name": "29.1.0",
        }

        # Retry in a loop rather than by recursion so a long block cannot
        # grow the call stack without bound.
        while True:
            response = await self.request(
                method="GET",
                url=url,
                headers=self.headers,
                params=params,
            )
            if response.status_code == 444:
                delay = self.check_interval * 3
                logger.info(f"[post_aweme_list] access denied, sleep {delay}s to try again.")
            elif response.status_code == 200 and len(response.text) == 0:
                delay = self.check_interval * 10
                logger.info(f"[post_aweme_list] maybe the access denied, sleep {delay}s to try again.")
            else:
                break
            await asyncio.sleep(delay)

        if response.status_code != 200:
            raise AssertionError(f"request failed, status_code: {response.status_code}, text: {response.text}")

        js = response.json()
        return [self._parse_aweme(aweme) for aweme in (js.get("aweme_list") or [])]

    @classmethod
    def _parse_min_date(cls, min_date: str) -> datetime:
        """Parse ``min_date`` using any of ``MIN_DATE_FORMATS``."""
        for fmt in cls.MIN_DATE_FORMATS:
            try:
                return datetime.strptime(min_date, fmt)
            except ValueError:
                continue
        raise ValueError(f"min_date {min_date!r} does not match any of {cls.MIN_DATE_FORMATS}")

    async def get_video_list_by_min_date(self, min_date: str = "2025-06-10 00:00:00"):
        """List videos created strictly after ``min_date``.

        Pages through the feed until a page contains only older videos, a
        page is empty, the cursor stops advancing, or ``MAX_PAGES`` is hit.

        Returns:
            A list of task dicts with keys ``aweme_id``, ``create_time_str``,
            ``title``, ``desc``, ``video_url`` and ``tags``.
        """
        cutoff = self._parse_min_date(min_date)
        result = list()
        seen_aweme_ids = set()
        max_cursor = 0

        for _ in range(self.MAX_PAGES):
            rows = await self.post_aweme_list(max_cursor=max_cursor, count=18)
            if all(datetime.fromtimestamp(row["create_time"]) < cutoff for row in rows):
                break

            previous_cursor = max_cursor
            for row in rows:
                create_time = row["create_time"]
                # The next page starts at the oldest creation time seen (ms).
                # NOTE(review): if the feed puts pinned (old) videos on the
                # first page this may skip newer posts - confirm against the API.
                cursor = int(create_time * 1000)
                if max_cursor == 0 or cursor < max_cursor:
                    max_cursor = cursor

                if datetime.fromtimestamp(create_time) <= cutoff:
                    continue
                aweme_id = row["aweme_id"]
                if aweme_id in seen_aweme_ids:
                    continue
                video_url_list = row["video_url_list"]
                if not video_url_list:
                    logger.warning(f"skip aweme without video url; {self.flag}; aweme_id: {aweme_id}")
                    continue

                seen_aweme_ids.add(aweme_id)
                result.append({
                    "aweme_id": aweme_id,
                    "create_time_str": row["create_time_str"],
                    "title": row["title"],
                    "desc": row["desc"],
                    "video_url": random.choice(video_url_list),
                    "tags": row["tags"],
                })

            if max_cursor == previous_cursor:
                # Same cursor means the next request would return this page again.
                break
        return result

    @staticmethod
    def _safe_title(title: str) -> str:
        """Replace path-hostile and control characters; cap at 50 characters."""
        return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title)[:50]

    async def download_new_video_list_by_min_date(self, min_date: str = "2025-06-10 00:00:00"):
        """Download every not-yet-recorded video newer than ``min_date``.

        A video is skipped when its id is already recorded, or when its
        non-empty title matches a recorded title. Failed downloads are logged
        and skipped without being recorded.
        """
        new_video_list = await self.get_video_list_by_min_date(min_date=min_date)
        extension = str(self.file_output_fmt).lstrip(".")

        for new_video in new_video_list:
            aweme_id = new_video["aweme_id"]
            create_time_str = new_video["create_time_str"]
            title = new_video["title"]
            desc = new_video["desc"]
            video_url = new_video["video_url"]
            tags = new_video["tags"]

            # One read of the info file feeds both duplicate checks.
            video_info = await self.load_downloaded_video_info()
            self.downloaded_video_aweme_id_set = set(video_info)
            self.downloaded_video_title_set = {v["title"] for v in video_info.values() if "title" in v}

            if aweme_id in self.downloaded_video_aweme_id_set:
                continue
            # An empty title (description was only hashtags) identifies
            # nothing, so it must not count as a duplicate.
            if title and title in self.downloaded_video_title_set:
                continue

            title_ = self._safe_title(title)
            filename = self.file_output_dir / f"[{aweme_id}][{create_time_str}]{title_}.{extension}"

            if not filename.exists():
                logger.info(f"download video; {self.flag}; "
                            f"aweme_id: {aweme_id}, create_time_str: {create_time_str}, "
                            f"title: {title_}, "
                            f"desc: {desc}"
                            )
                try:
                    await self.download_video_by_url(filename, video_url)
                except Exception as e:
                    logger.error(f"download video failed; error type: {type(e)}, error text: {str(e)}, url: {video_url}")
                    continue

            await self.save_downloaded_video_info(create_time_str, filename, video_url, aweme_id, title, desc, tags)

    async def run(self):
        """Run one download pass using the configured ``min_date``."""
        await self.download_new_video_list_by_min_date(self.min_date)
        return

    async def start(self):
        """Poll forever: run a pass, log the outcome, sleep ``check_interval``."""
        while True:
            try:
                await self.run()
                logger.info(f"{self.flag}新视频检测... 刷新间隔 {self.check_interval}s")
            except Exception as error:
                # Top-level boundary: log with traceback and keep polling.
                logger.exception(f"{self.flag}新视频检测错误\n{repr(error)}")
            await asyncio.sleep(self.check_interval)
async def main():
    """Run a single download pass for one hard-coded Douyin account.

    Videos and the bookkeeping JSON are stored under
    ``<project_path>/data/video/douyin/陈杰森``. The downloader's HTTP client
    is always closed, even when the pass raises.
    """
    output_dir = project_path / "data/video/douyin/陈杰森"

    dy_downloader = DouyinDownloader(
        platform="douyin",
        user_name="douyin",
        # Full timestamp: the layout the downloader parses with strptime.
        min_date="2025-06-10 00:00:00",
        # sec_user_id="MS4wLjABAAAAQinRMLyQNYA45OYXoCDrwszhRGaDVirRE1fTNSaGGkc",
        sec_user_id="MS4wLjABAAAATGoBrO7yiJ3q9go4fxq9JXjrnP1bFpdkgKckC1IpfXA_vrjSmL9ZtjmTju8ApwbT",
        file_output_dir=output_dir.as_posix(),
        file_info_file=(output_dir / "file_info.json").as_posix(),
    )
    try:
        await dy_downloader.download_new_video_list_by_min_date(min_date="2025-07-03 00:00:00")
    finally:
        # Release pooled connections held by the httpx.AsyncClient.
        await dy_downloader.client.aclose()
    return
if __name__ == "__main__":
    # Without a configured handler the INFO-level progress messages of the
    # "toolbox" logger are dropped; basicConfig is a no-op when the root
    # logger already has handlers.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )
    asyncio.run(main())