#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Poll a Douyin user's post feed and download newly published videos.

The module exposes :class:`DouyinDownloader`, which queries the Douyin web
endpoint for a user's posts, downloads every video newer than a cut-off date
and records what was downloaded in a JSON index file.
"""
import asyncio
from datetime import datetime
import json
import logging
from pathlib import Path
import random
import re
import time
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin

import aiofiles
import httpx

from project_settings import project_path

logger = logging.getLogger("toolbox")


# SECURITY NOTE(review): the "cookie" value below embeds what look like live
# session credentials (sessionid, sid_guard, passport tokens, ...). It is kept
# byte-for-byte so behaviour does not change, but it should be rotated and
# loaded from configuration or the environment instead of living in source.
# The value is split across adjacent string literals purely for layout; Python
# concatenates them into a single header value.
DEFAULT_HEADERS = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
    "referer": "https://www.douyin.com/",
    "cookie": (
        "enter_pc_once=1; UIFID_TEMP=163eead721bc91ca6f3a3cb4766a73c0638fabced1012e02c28bcbf3169aca5c131be8508d5b2900b15c77026693ad016706daf3577246266fd7a479e2c70f0a3968ec59059e59b609708db349a17ec8; s_v_web_id=verify_mcq56tpq_wqZcsYMx_0lCj_4d3j_BTUO_TghtqFe3Awt5; hevc_supported=true; dy_swidth=1920; dy_sheight=1080; fpk1=U2FsdGVkX1+8cnptVaRUMZ4YRHZbX6C5V2sPGH4oztEfPUzJREBsPmoxqj8pFRNQCR/0dxJNHjjYKsV5DfjwLQ==; fpk2=5b2ba492da1bf8b88f5f71b161575820; passport_csrf_token=6e0bdaf8e3f77ddcf5e175a23ca3de43; passport_csrf_token_default=6e0bdaf8e3f77ddcf5e175a23ca3de43; __security_mc_1_s_sdk_crypt_sdk=e70bb4ad-40c3-a290; __security_mc_1_s_sdk_cert_key=b5792a4e-49a7-bd77; bd_ticket_guard_client_web_domain=2; d_ticket=585e449b8f1d99eacf78acdbe38c3c8e66074; n_mh=O0fKzxpW1JwZ-FhQ8TWI4BT0LewuwA38XumyGMbx894; uid_tt=e7a6d987a86a99e030cc79cb8602e990; uid_tt_ss=e7a6d987a86a99e030cc79cb8602e990; sid_tt=166574208279efc3c59c031c7130b41b; sessionid=166574208279efc3c59c031c7130b41b; sessionid_ss=166574208279efc3c59c031c7130b41b; session_tlb_tag=sttt%7C4%7CFmV0IIJ578PFnAMccTC0G_________-2-nDYOve9XfDYJfKlJaro-JQs3ftBRh-9gKAfVWv2e14%3D; is_staff_user=false; is_dash_user=1; __security_server_data_status=1; "
        "UIFID=163eead721bc91ca6f3a3cb4766a73c0638fabced1012e02c28bcbf3169aca5c131be8508d5b2900b15c77026693ad01e5a8d558171969e1966551b912cae503cba9c9e66be1ed40f28f8ddf18c9dc24d88b9fb2d777753990d5853c74777955fc3652eb8ab9f2ee67c3ffc8d0f340bcc77115073a26843525cad007257ad788d901915c7a4f2bac8d0c20d10ae4339c08bcf2750e19f7c92f4ef02baa9a856d; my_rd=2; xgplayer_device_id=77043468800; xgplayer_user_id=952792079155; live_use_vvc=%22false%22; passport_mfa_token=CjWll%2F%2FHDAp0ax6akf%2BMNx4U65LGPmGZOQVcnyy59TZStqgOayVGd5uTdnOWNGZU%2B0YBkUhP0RpKCjwAAAAAAAAAAAAAT019rQ8a6REnMySCyTAAVhFra5ZCBhIG6yI6xbSyb96Mym9ZOWNkBz7hhDVGAW9Eq54Qi6z4DRj2sdFsIAIiAQMylHAB; passport_assist_user=Cjyfe4luw56SFCDA0i3L6-360Pj1rJCvLzJZ_XyWevQdgZcD7FJ9_aS9V6529MzTTX2CfuhilG9ONfIJ_9caSgo8AAAAAAAAAAAAAE9NvuwN8mN1pZJGcGcI0_hTUSO2skqwqXjkSNurZy3sBYHtRHLak13ML4-WwcTIpMiCEIus-A0Yia_WVCABIgEDwMHL6g%3D%3D; passport_auth_status=1afdadf16f0a1b1f9e242a63bad2a4ad%2Cb719c4bdc913fd296c195ace9b4b5573; passport_auth_status_ss=1afdadf16f0a1b1f9e242a63bad2a4ad%2Cb719c4bdc913fd296c195ace9b4b5573; sid_guard=166574208279efc3c59c031c7130b41b%7C1754101144%7C5184000%7CWed%2C+01-Oct-2025+02%3A19%3A04+GMT; sid_ucp_v1=1.0.0-KDBiYmNmMDA5YmQyYjQ3OTNjYjc2NDRmMWY5ZWZiZmYxZTA2NGVlYWEKHwi8v5DIqQIQmOu1xAYY7zEgDDC3y7LRBTgCQPEHSAQaAmhsIiAxNjY1NzQyMDgyNzllZmMzYzU5YzAzMWM3MTMwYjQxYg; ssid_ucp_v1=1.0.0-KDBiYmNmMDA5YmQyYjQ3OTNjYjc2NDRmMWY5ZWZiZmYxZTA2NGVlYWEKHwi8v5DIqQIQmOu1xAYY7zEgDDC3y7LRBTgCQPEHSAQaAmhsIiAxNjY1NzQyMDgyNzllZmMzYzU5YzAzMWM3MTMwYjQxYg; login_time=1754101144621; _bd_ticket_crypt_cookie=9f1aa50dacbba771e10f850990ecc56c; __security_mc_1_s_sdk_sign_data_key_web_protect=853b5c52-4170-98c8; SelfTabRedDotControl=%5B%7B%22id%22%3A%227510617703292667931%22%2C%22u%22%3A11%2C%22c%22%3A0%7D%2C%7B%22id%22%3A%227076981218884978725%22%2C%22u%22%3A511%2C%22c%22%3A0%7D%5D; "
        "__druidClientInfo=JTdCJTIyY2xpZW50V2lkdGglMjIlM0EyOTglMkMlMjJjbGllbnRIZWlnaHQlMjIlM0E1MzglMkMlMjJ3aWR0aCUyMiUzQTI5OCUyQyUyMmhlaWdodCUyMiUzQTUzOCUyQyUyMmRldmljZVBpeGVsUmF0aW8lMjIlM0ExJTJDJTIydXNlckFnZW50JTIyJTNBJTIyTW96aWxsYSUyRjUuMCUyMChNYWNpbnRvc2glM0IlMjBJbnRlbCUyME1hYyUyME9TJTIwWCUyMDEwXzE1XzcpJTIwQXBwbGVXZWJLaXQlMkY1MzcuMzYlMjAoS0hUTUwlMkMlMjBsaWtlJTIwR2Vja28pJTIwQ2hyb21lJTJGMTM4LjAuMC4wJTIwU2FmYXJpJTJGNTM3LjM2JTIyJTdE; volume_info=%7B%22volume%22%3A0.778%2C%22isMute%22%3Afalse%2C%22isUserMute%22%3Afalse%7D; publish_badge_show_info=%220%2C0%2C0%2C1754911178612%22; __live_version__=%221.1.3.7143%22; live_can_add_dy_2_desktop=%221%22; download_guide=%220%2F%2F1%22; WallpaperGuide=%7B%22showTime%22%3A0%2C%22closeTime%22%3A0%2C%22showCount%22%3A0%2C%22cursor1%22%3A160%2C%22cursor2%22%3A50%2C%22hoverTime%22%3A1752156303322%7D; strategyABtestKey=%221755213526.48%22; __ac_nonce=0689f18390096fdb3ee8f; __ac_signature=_02B4Z6wo00f01zYpLFgAAIDBBbzaGE6bBP82CSjAAKUf26; douyin.com; xg_device_score=7.547739235768296; device_web_cpu_core=8; device_web_memory_size=8; gulu_source_res=eyJwX2luIjoiYmY5ZDgyN2ZlMmQ3ZWYyOGU3ZjJmZGI4NTdhYTAxZGNlMjNlMTViMmRkZDM5ODM0MzJiMzE3NjA2OGU3OTEyNiJ9; stream_player_status_params=%22%7B%5C%22is_auto_play%5C%22%3A0%2C%5C%22is_full_screen%5C%22%3A0%2C%5C%22is_full_webscreen%5C%22%3A1%2C%5C%22is_mute%5C%22%3A0%2C%5C%22is_speed%5C%22%3A1%2C%5C%22is_visible%5C%22%3A0%7D%22; FOLLOW_LIVE_POINT_INFO=%22MS4wLjABAAAARchEEl9h74Ky-zMgdyLxIzDgZlhHOO3KB-V5h_nhI_Q%2F1755273600000%2F0%2F0%2F1755257556086%22; FOLLOW_NUMBER_YELLOW_POINT_INFO=%22MS4wLjABAAAARchEEl9h74Ky-zMgdyLxIzDgZlhHOO3KB-V5h_nhI_Q%2F1755273600000%2F0%2F1755256956086%2F0%22; "
        "bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCRVFyUndCSkxSakF2K3g4SzQwZXFoWlNpUksrT3NGTEJIQzh1dWtMUzE2RXRCWThMbExkejBUY3RsL2c4SXdpYldTbThsZFZoaWIzUEs0OXArb0dJSlE9IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoyfQ%3D%3D; ttwid=1%7CN05Iw2OUzw_zu1lcgD2Nnt7DlhY3U6BnjTZUnGZMlj4%7C1755256961%7C86396afa0f78587ebc26f398c5c8335f390cfbd081c341af6a4325fd36f3c860; odin_tt=67e15087730b08cd9ce726980428ef07821c09faef000e0a5a94bebdaac917118c8a3761bf34d1eb4458a399f7ead945; biz_trace_id=84799947; sdk_source_info=7e276470716a68645a606960273f276364697660272927676c715a6d6069756077273f276364697660272927666d776a68605a607d71606b766c6a6b5a7666776c7571273f275e58272927666a6b766a69605a696c6061273f27636469766027292762696a6764695a7364776c6467696076273f275e582729277672715a646971273f2763646976602729277f6b5a666475273f2763646976602729276d6a6e5a6b6a716c273f2763646976602729276c6b6f5a7f6367273f27636469766027292771273f27323036373d3632303730303234272927676c715a75776a716a666a69273f2763646976602778; bit_env=ZaZWWZ1txljN5I_WfKkXqlt1dYQaaJjlDN4lVYH-oVy4OD8ZWpqdiRqqCQeQPoVaO0sQ2QloHyGtrbR1AIBn0Iea2Xn0gLqYrQsGvNZXUv8LTupNgCFq_RRIDpiB5GtvIag0RdQ0xASNwhpHHP7v9HttY2RjnX5mAHVMsL9I5Yg_0FXr22MftQcGS1G15ixTe8sEX2u-97oyTaGL7FJ6dYVtbTajxl74CePYTw6-U2VWweQ772lHZJ5wpkxlt46xAzWB8dbVThrgwuDNx0TCnuHY9tWRFJAV67pbbO48__c3dqTjfqFfVlCaaQwLEJ9O7vKXYpmn9Jp2Q_nMKJRlT3xEP_QgbZyzc5TUHaktZMuECUU1PqOGy-zpOvPQSUQKCW5RQVeklOubwfrDHN4ppYc9oMXdJc8TjJ7-9kQ14cG6rPXRcyQZaJdIQnlPF_59odjueeeb4k14N8VZ4mvu11pc3ntm426tVEw52-R21YEfDjWAxStsmfYRJ8yQKda2cLmvfvyGv6FRtc7ykBJGBdevuvj3Ng3vCynGIHNOQ4g%3D; passport_auth_mix_state=4zlyzmvyoew0ppa2qqsnuog7k19i3msu; IsDouyinActive=true; home_can_add_dy_2_desktop=%220%22; "
        "stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A1920%2C%5C%22screen_height%5C%22%3A1080%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A8%2C%5C%22device_memory%5C%22%3A8%2C%5C%22downlink%5C%22%3A6.25%2C%5C%22effective_type%5C%22%3A%5C%224g%5C%22%2C%5C%22round_trip_time%5C%22%3A100%7D%22"
    ),
}


class DouyinDownloader(object):
    """Download the videos a Douyin user published after a cut-off date.

    Downloaded videos are written to ``file_output_dir`` and indexed by
    ``aweme_id`` in the JSON file ``file_info_file`` so that later runs skip
    them. Call :meth:`aclose` when done to release the HTTP client.
    """

    # Characters replaced by "_" when a title is used as a file name.
    _ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')
    # Accepted layouts for ``min_date``; the full timestamp is tried first.
    _MIN_DATE_FORMATS = ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d")
    # HTTP statuses treated as "follow the Location header".
    _REDIRECT_STATUS_CODES = (301, 302, 303, 307, 308)
    # Upper bound on redirect hops for a single download.
    _MAX_REDIRECTS = 10
    # Upper bound on feed pages fetched per scan.
    _MAX_PAGES = 100

    def __init__(self,
                 platform: str,
                 user_name: str,
                 sec_user_id: str,
                 min_date: str = None,
                 headers: dict = None,
                 cookies: str = None,
                 check_interval: int = 10*60,
                 file_output_fmt: str = "mp4",
                 file_output_dir: str = "output",
                 file_info_file: str = "file_info.json",
                 ):
        """Store the configuration and create the HTTP client.

        Args:
            platform: Label used only in log prefixes.
            user_name: Label used only in log prefixes.
            sec_user_id: Douyin ``sec_user_id`` of the account to scan.
            min_date: Cut-off as ``"%Y-%m-%d %H:%M:%S"`` (``"%Y-%m-%d"`` is
                also accepted); defaults to the current local time.
            headers: Request headers; defaults to ``DEFAULT_HEADERS``.
            cookies: Passed straight to ``httpx.AsyncClient(cookies=...)``.
                NOTE(review): annotated ``str`` but httpx documents mapping /
                cookie-jar types here - confirm before passing a raw string.
            check_interval: Seconds between scans in :meth:`start`; also used
                as the HTTP timeout and as the base for back-off sleeps.
            file_output_fmt: File extension for saved videos (without dot).
            file_output_dir: Directory that receives the video files.
            file_info_file: Path of the JSON index of downloaded videos.
        """
        self.platform = platform
        self.user_name = user_name
        self.sec_user_id = sec_user_id
        self.min_date = min_date or time.strftime("%Y-%m-%d %H:%M:%S")
        self.headers = headers or DEFAULT_HEADERS
        self.cookies = cookies
        self.check_interval = check_interval
        self.file_output_fmt = file_output_fmt
        self.file_output_dir: Path = Path(file_output_dir)
        self.file_info_file: Path = Path(file_info_file)

        self.flag = f"[{self.__class__.__name__}][{self.platform}][{self.user_name}]"

        # state
        self.run_recording = False
        self.downloaded_video_aweme_id_set = None
        self.downloaded_video_title_set = None

        # client
        self.client: httpx.AsyncClient = self.get_client()

    def get_client(self) -> httpx.AsyncClient:
        """Build the shared ``httpx.AsyncClient`` from the instance settings."""
        client = httpx.AsyncClient(
            http2=True,
            timeout=self.check_interval,
            limits=httpx.Limits(max_keepalive_connections=100, keepalive_expiry=self.check_interval * 2),
            headers=self.headers,
            cookies=self.cookies,
        )
        return client

    async def aclose(self) -> None:
        """Close the underlying HTTP client and its pooled connections."""
        await self.client.aclose()

    async def request(self, method: str, url: str, **kwargs):
        """Send a request through the shared client.

        Raises:
            ConnectionError: On ``httpx.ReadError``, ``httpx.ConnectError`` or
                ``httpx.RemoteProtocolError``; the original error is chained.
        """
        try:
            return await self.client.request(method, url, **kwargs)
        except (httpx.ReadError, httpx.ConnectError, httpx.RemoteProtocolError) as e:
            raise ConnectionError(
                f"request failed; error type: {type(e)}, error text: {str(e)}, url: {url}"
            ) from e

    async def save_downloaded_video_info(self,
                                         create_time: str,
                                         filename: Path,
                                         url: str,
                                         video_id: str,
                                         title: str,
                                         desc: str,
                                         tags: List[str]) -> str:
        """Add (or overwrite) one entry in the JSON index and rewrite the file.

        Returns:
            The POSIX path of the index file.
        """
        video_info = await self.load_downloaded_video_info()
        video_info[video_id] = {
            "create_time": create_time,
            "filename": filename.as_posix(),
            "title": title,
            "desc": desc,
            "tags": tags,
            "url": url,
            "video_id": video_id,
        }
        # The index may live outside the video directory, so make sure its
        # parent exists before writing.
        self.file_info_file.parent.mkdir(parents=True, exist_ok=True)
        async with aiofiles.open(self.file_info_file.as_posix(), "w", encoding="utf-8") as f:
            video_info_ = json.dumps(video_info, ensure_ascii=False, indent=2)
            await f.write(f"{video_info_}\n")
        return self.file_info_file.as_posix()

    async def load_downloaded_video_info(self) -> Dict[str, dict]:
        """Read the JSON index; a missing or empty file yields an empty dict."""
        if not self.file_info_file.exists():
            return dict()
        async with aiofiles.open(self.file_info_file.as_posix(), "r", encoding="utf-8") as f:
            data = await f.read()
        if not data.strip():
            # An empty index file would otherwise raise JSONDecodeError.
            return dict()
        video_info: dict = json.loads(data)
        return video_info

    async def get_downloaded_video_aweme_id_set(self) -> Set[str]:
        """Return the ids (index keys) of every recorded video."""
        video_info_dict: dict = await self.load_downloaded_video_info()
        return set(video_info_dict)

    async def get_downloaded_video_title_set(self) -> Set[str]:
        """Return the titles of every recorded video."""
        video_info_dict: dict = await self.load_downloaded_video_info()
        return {v["title"] for v in video_info_dict.values()}

    async def download_video_by_url(self, filename: Path, url: str):
        """Download ``url`` into ``filename``, following redirects.

        Returns:
            ``filename`` as a ``Path`` once the body has been written.

        Raises:
            AssertionError: The final response status is not 200.
            ConnectionError: More than ``_MAX_REDIRECTS`` redirects occurred,
                or the request itself failed (see :meth:`request`).
        """
        filename = Path(filename)
        filename.parent.mkdir(parents=True, exist_ok=True)

        for _ in range(self._MAX_REDIRECTS + 1):
            response = await self.request(
                method="GET",
                url=url,
                headers=self.headers,
            )
            if response.status_code in self._REDIRECT_STATUS_CODES:
                # Location may be relative; resolve it against the current URL.
                url = urljoin(url, response.headers["Location"])
                continue
            if response.status_code == 200:
                # aiofiles keeps the event loop responsive during the write.
                async with aiofiles.open(filename.as_posix(), "wb") as f:
                    await f.write(response.content)
                return filename
            raise AssertionError(f"Got status code {response.status_code}")

        raise ConnectionError(f"too many redirects (> {self._MAX_REDIRECTS}), last url: {url}")

    @staticmethod
    def _parse_aweme(aweme: dict) -> dict:
        """Flatten one raw ``aweme`` item into the row used by this class."""
        desc = aweme["desc"]
        create_time = aweme["create_time"]
        create_time_str = datetime.fromtimestamp(create_time).strftime("%Y%m%d_%H%M%S")
        video_url_list = aweme["video"]["play_addr"]["url_list"]

        # Collect unique tags in first-seen order so the output is stable.
        tags: List[str] = []
        for extra in aweme.get("text_extra") or []:
            tag = extra.get("hashtag_name") or extra.get("search_text")
            # Empty tags are skipped: replacing "#" + "" would strip every "#".
            if not tag or tag in tags:
                continue
            tags.append(tag)

        # Remove longer tags first so "#ab" is not mangled by removing "#a".
        title: str = desc
        for tag in sorted(tags, key=len, reverse=True):
            title = title.replace(f"#{tag}", "")
        title = title.strip()

        return {
            "aweme_id": aweme["aweme_id"],
            "create_time": create_time,
            "create_time_str": create_time_str,
            "title": title,
            "desc": desc,
            "video_url_list": video_url_list,
            "tags": tags,
        }

    async def post_aweme_list(self, max_cursor: int = 0, count: int = 18):
        """Fetch one page of the user's posts and return parsed rows.

        A 444 status or an empty 200 body is treated as access denial: the
        call sleeps (3x / 10x ``check_interval``) and retries indefinitely.

        Returns:
            A list of dicts with keys ``aweme_id``, ``create_time``,
            ``create_time_str``, ``title``, ``desc``, ``video_url_list`` and
            ``tags``.

        Raises:
            AssertionError: Any other non-200 status.
        """
        url = "https://www.douyin.com/aweme/v1/web/aweme/post/"
        params = {
            "device_platform": "webapp",
            "aid": "6383",
            "channel": "channel_pc_web",
            "sec_user_id": self.sec_user_id,
            "max_cursor": max_cursor,
            "count": count,
            "publish_video_strategy_type": "2",
            "version_code": "290100",
            "version_name": "29.1.0",
        }

        # Retry in a loop rather than by recursion so long outages cannot grow
        # the call stack.
        while True:
            response = await self.request(
                method="GET",
                url=url,
                headers=self.headers,
                params=params,
            )
            if response.status_code == 444:
                # Access Denied
                logger.info(f"[post_aweme_list] access denied, sleep {self.check_interval * 3}s to try again.")
                await asyncio.sleep(self.check_interval * 3)
                continue
            if response.status_code == 200 and len(response.text) == 0:
                # Access Denied
                logger.info(f"[post_aweme_list] maybe the access denied, sleep {self.check_interval * 10}s to try again.")
                await asyncio.sleep(self.check_interval * 10)
                continue
            if response.status_code != 200:
                raise AssertionError(f"request failed, status_code: {response.status_code}, text: {response.text}")
            break

        js = response.json()
        # A null "aweme_list" is treated as an empty page; a missing key still
        # raises KeyError so malformed payloads are not silently ignored.
        aweme_list = js["aweme_list"] or []
        return [self._parse_aweme(aweme) for aweme in aweme_list]

    @classmethod
    def _parse_min_date(cls, min_date: str) -> datetime:
        """Parse ``min_date`` using the first matching accepted format."""
        for fmt in cls._MIN_DATE_FORMATS:
            try:
                return datetime.strptime(min_date, fmt)
            except ValueError:
                continue
        raise ValueError(
            f"min_date {min_date!r} does not match any of {cls._MIN_DATE_FORMATS}"
        )

    async def get_video_list_by_min_date(self, min_date: str = "2025-06-10 00:00:00"):
        """Page through the feed and collect videos newer than ``min_date``.

        Paging stops when a page contains only older videos (or is empty),
        when the cursor stops advancing, or after ``_MAX_PAGES`` pages.

        Returns:
            A list of dicts with keys ``aweme_id``, ``create_time_str``,
            ``title``, ``desc``, ``video_url`` and ``tags``.
        """
        min_date_ = self._parse_min_date(min_date)

        result = list()
        seen_aweme_ids = set()
        max_cursor = 0
        for _ in range(self._MAX_PAGES):
            rows = await self.post_aweme_list(max_cursor=max_cursor, count=18)

            # all() of an empty page is True, so an exhausted feed ends here.
            if all(datetime.fromtimestamp(row["create_time"]) < min_date_ for row in rows):
                break

            previous_cursor = max_cursor
            for row in rows:
                create_time = row["create_time"]
                aweme_id = row["aweme_id"]

                # The next page is requested from the oldest timestamp seen.
                # NOTE(review): an old item on a page (e.g. a pinned post)
                # would pull the cursor far back - confirm whether the
                # endpoint returns its own cursor that should be used instead.
                cursor = int(create_time * 1000)
                if max_cursor == 0 or cursor < max_cursor:
                    max_cursor = cursor

                if datetime.fromtimestamp(create_time) <= min_date_:
                    continue
                if aweme_id in seen_aweme_ids:
                    continue
                video_url_list = row["video_url_list"]
                if not video_url_list:
                    logger.warning(f"no play url; {self.flag}; aweme_id: {aweme_id}")
                    continue
                seen_aweme_ids.add(aweme_id)

                result.append({
                    "aweme_id": aweme_id,
                    "create_time_str": row["create_time_str"],
                    "title": row["title"],
                    "desc": row["desc"],
                    # Pick one of the mirror URLs at random.
                    "video_url": random.choice(video_url_list),
                    "tags": row["tags"],
                })

            if max_cursor == previous_cursor:
                # No progress: the same page would be requested again.
                break
        return result

    async def download_new_video_list_by_min_date(self, min_date: str = "2025-06-10 00:00:00"):
        """Download every not-yet-recorded video newer than ``min_date``.

        A video is skipped when its id or its title is already in the index.
        A failed download is logged and skipped; it is not recorded.
        """
        new_video_list = await self.get_video_list_by_min_date(min_date=min_date)

        # Read the index once and keep the sets current in memory instead of
        # re-reading the JSON file twice for every candidate video.
        video_info = await self.load_downloaded_video_info()
        self.downloaded_video_aweme_id_set = set(video_info)
        self.downloaded_video_title_set = {v["title"] for v in video_info.values()}

        extension = self.file_output_fmt.lstrip(".")
        for new_video in new_video_list:
            aweme_id = new_video["aweme_id"]
            create_time_str = new_video["create_time_str"]
            title = new_video["title"]
            desc = new_video["desc"]
            video_url = new_video["video_url"]
            tags = new_video["tags"]

            if aweme_id in self.downloaded_video_aweme_id_set:
                continue
            # NOTE(review): title de-duplication also skips distinct videos
            # that share a title (including an empty one) - confirm intent.
            if title in self.downloaded_video_title_set:
                continue

            title_ = self._ILLEGAL_FILENAME_CHARS.sub('_', title)
            title_ = title_[:50]
            filename = self.file_output_dir / f"[{aweme_id}][{create_time_str}]{title_}.{extension}"

            if not filename.exists():
                logger.info(f"download video; {self.flag}; "
                            f"aweme_id: {aweme_id}, create_time_str: {create_time_str}, "
                            f"title: {title_}, "
                            f"desc: {desc}"
                            )
                try:
                    await self.download_video_by_url(filename, video_url)
                except Exception as e:
                    logger.error(f"download video failed; error type: {type(e)}, error text: {str(e)}, url: {video_url}")
                    continue

            await self.save_downloaded_video_info(create_time_str, filename, video_url, aweme_id, title, desc, tags)
            self.downloaded_video_aweme_id_set.add(aweme_id)
            self.downloaded_video_title_set.add(title)

    async def run(self):
        """Run one scan using the instance's ``min_date``."""
        await self.download_new_video_list_by_min_date(self.min_date)
        return

    async def start(self):
        """Scan forever, sleeping ``check_interval`` seconds between scans.

        Any exception from a scan is logged and the loop continues.
        """
        while True:
            try:
                await self.run()
                logger.info(f"{self.flag}新视频检测... 刷新间隔 {self.check_interval}s")
                await asyncio.sleep(self.check_interval)
            except Exception as error:
                logger.exception(f"{self.flag}新视频检测错误\n{repr(error)}")
                await asyncio.sleep(self.check_interval)
                continue


async def main():
    """Run a single download pass for one hard-coded account."""
    dy_downloader = DouyinDownloader(
        platform="douyin",
        user_name="douyin",
        min_date="2025-06-10",
        # sec_user_id="MS4wLjABAAAAQinRMLyQNYA45OYXoCDrwszhRGaDVirRE1fTNSaGGkc",
        sec_user_id="MS4wLjABAAAATGoBrO7yiJ3q9go4fxq9JXjrnP1bFpdkgKckC1IpfXA_vrjSmL9ZtjmTju8ApwbT",
        file_output_dir=(project_path / "data/video/douyin/陈杰森").as_posix(),
        file_info_file=(project_path / "data/video/douyin/陈杰森/file_info.json").as_posix(),
    )
    try:
        await dy_downloader.download_new_video_list_by_min_date(min_date="2025-07-03 00:00:00")
    finally:
        # Release pooled connections even when the pass fails.
        await dy_downloader.aclose()
    return


if __name__ == "__main__":
    asyncio.run(main())