qgyd2021's picture
first commit
f176037
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import aiofiles
import asyncio
from datetime import datetime
import httpx
import json
import logging
from pathlib import Path
import random
import re
import time
from typing import Dict, List, Set
from project_settings import project_path
# Module-level logger; the "toolbox" name is what the rest of the project
# configures, so it is kept as-is.
logger = logging.getLogger("toolbox")

# Headers sent with every request when the caller does not supply its own.
#
# NOTE(review): the "cookie" value is a logged-in browser session committed to
# source control. It will expire and it is a credential; prefer passing
# ``headers=`` to ``DouyinDownloader`` from configuration or the environment,
# and rotate this session.
#
# The cookie is written as adjacent string literals (implicit concatenation)
# so that the value is one single line with no embedded newlines.
DEFAULT_HEADERS = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
    "referer": "https://www.douyin.com/",
    "cookie": (
        "enter_pc_once=1; UIFID_TEMP=163eead721bc91ca6f3a3cb4766a73c0638fabced1012e02c28bcbf3169aca5c131be8508d5b2900b15c77026693ad016706daf3577246266fd7a479e2c70f0a3968ec59059e59b609708db349a17ec8; s_v_web_id=verify_mcq56tpq_wqZcsYMx_0lCj_4d3j_BTUO_TghtqFe3Awt5; hevc_supported=true; dy_swidth=1920; dy_sheight=1080; fpk1=U2FsdGVkX1+8cnptVaRUMZ4YRHZbX6C5V2sPGH4oztEfPUzJREBsPmoxqj8pFRNQCR/0dxJNHjjYKsV5DfjwLQ==; fpk2=5b2ba492da1bf8b88f5f71b161575820; passport_csrf_token=6e0bdaf8e3f77ddcf5e175a23ca3de43; passport_csrf_token_default=6e0bdaf8e3f77ddcf5e175a23ca3de43; __security_mc_1_s_sdk_crypt_sdk=e70bb4ad-40c3-a290; __security_mc_1_s_sdk_cert_key=b5792a4e-49a7-bd77; bd_ticket_guard_client_web_domain=2; d_ticket=585e449b8f1d99eacf78acdbe38c3c8e66074; n_mh=O0fKzxpW1JwZ-FhQ8TWI4BT0LewuwA38XumyGMbx894; uid_tt=e7a6d987a86a99e030cc79cb8602e990; uid_tt_ss=e7a6d987a86a99e030cc79cb8602e990; sid_tt=166574208279efc3c59c031c7130b41b; sessionid=166574208279efc3c59c031c7130b41b; sessionid_ss=166574208279efc3c59c031c7130b41b; session_tlb_tag=sttt%7C4%7CFmV0IIJ578PFnAMccTC0G_________-2-nDYOve9XfDYJfKlJaro-JQs3ftBRh-9gKAfVWv2e14%3D; is_staff_user=false; is_dash_user=1; __security_server_data_status=1; UIFID=163eead721bc91ca6f3a3cb4766a73c0638fabced1012e02c28bcbf3169aca5c131be8508d5b2900b15c77026693ad01e5a8d558171969e1966551b912cae503cba9c9e66be1ed40f28f8ddf18c9dc24d88b9fb2d777753990d5853c74777955fc3652eb8ab9f2ee67c3ffc8d0f340bcc77115073a26843525cad007257ad788d901915c7a4f2bac8d0c20d10ae4339c08bcf2750e19f7c92f4ef02baa9a856d; my_rd=2; xgplayer_device_id=77043468800; xgplayer_user_id=952792079155; live_use_vvc=%22false%22; passport_mfa_token=CjWll%2F%2FHDAp0ax6akf%2BMNx4U65LGPmGZOQVcnyy59TZStqgOayVGd5uTdnOWNGZU%2B0YBkUhP0RpKCjwAAAAAAAAAAAAAT019rQ8a6REnMySCyTAAVhFra5ZCBhIG6yI6xbSyb96Mym9ZOWNkBz7hhDVGAW9Eq54Qi6z4DRj2sdFsIAIiAQMylHAB; "
        "passport_assist_user=Cjyfe4luw56SFCDA0i3L6-360Pj1rJCvLzJZ_XyWevQdgZcD7FJ9_aS9V6529MzTTX2CfuhilG9ONfIJ_9caSgo8AAAAAAAAAAAAAE9NvuwN8mN1pZJGcGcI0_hTUSO2skqwqXjkSNurZy3sBYHtRHLak13ML4-WwcTIpMiCEIus-A0Yia_WVCABIgEDwMHL6g%3D%3D; passport_auth_status=1afdadf16f0a1b1f9e242a63bad2a4ad%2Cb719c4bdc913fd296c195ace9b4b5573; passport_auth_status_ss=1afdadf16f0a1b1f9e242a63bad2a4ad%2Cb719c4bdc913fd296c195ace9b4b5573; sid_guard=166574208279efc3c59c031c7130b41b%7C1754101144%7C5184000%7CWed%2C+01-Oct-2025+02%3A19%3A04+GMT; sid_ucp_v1=1.0.0-KDBiYmNmMDA5YmQyYjQ3OTNjYjc2NDRmMWY5ZWZiZmYxZTA2NGVlYWEKHwi8v5DIqQIQmOu1xAYY7zEgDDC3y7LRBTgCQPEHSAQaAmhsIiAxNjY1NzQyMDgyNzllZmMzYzU5YzAzMWM3MTMwYjQxYg; ssid_ucp_v1=1.0.0-KDBiYmNmMDA5YmQyYjQ3OTNjYjc2NDRmMWY5ZWZiZmYxZTA2NGVlYWEKHwi8v5DIqQIQmOu1xAYY7zEgDDC3y7LRBTgCQPEHSAQaAmhsIiAxNjY1NzQyMDgyNzllZmMzYzU5YzAzMWM3MTMwYjQxYg; login_time=1754101144621; _bd_ticket_crypt_cookie=9f1aa50dacbba771e10f850990ecc56c; __security_mc_1_s_sdk_sign_data_key_web_protect=853b5c52-4170-98c8; SelfTabRedDotControl=%5B%7B%22id%22%3A%227510617703292667931%22%2C%22u%22%3A11%2C%22c%22%3A0%7D%2C%7B%22id%22%3A%227076981218884978725%22%2C%22u%22%3A511%2C%22c%22%3A0%7D%5D; __druidClientInfo=JTdCJTIyY2xpZW50V2lkdGglMjIlM0EyOTglMkMlMjJjbGllbnRIZWlnaHQlMjIlM0E1MzglMkMlMjJ3aWR0aCUyMiUzQTI5OCUyQyUyMmhlaWdodCUyMiUzQTUzOCUyQyUyMmRldmljZVBpeGVsUmF0aW8lMjIlM0ExJTJDJTIydXNlckFnZW50JTIyJTNBJTIyTW96aWxsYSUyRjUuMCUyMChNYWNpbnRvc2glM0IlMjBJbnRlbCUyME1hYyUyME9TJTIwWCUyMDEwXzE1XzcpJTIwQXBwbGVXZWJLaXQlMkY1MzcuMzYlMjAoS0hUTUwlMkMlMjBsaWtlJTIwR2Vja28pJTIwQ2hyb21lJTJGMTM4LjAuMC4wJTIwU2FmYXJpJTJGNTM3LjM2JTIyJTdE; volume_info=%7B%22volume%22%3A0.778%2C%22isMute%22%3Afalse%2C%22isUserMute%22%3Afalse%7D; publish_badge_show_info=%220%2C0%2C0%2C1754911178612%22; __live_version__=%221.1.3.7143%22; live_can_add_dy_2_desktop=%221%22; download_guide=%220%2F%2F1%22; "
        "WallpaperGuide=%7B%22showTime%22%3A0%2C%22closeTime%22%3A0%2C%22showCount%22%3A0%2C%22cursor1%22%3A160%2C%22cursor2%22%3A50%2C%22hoverTime%22%3A1752156303322%7D; strategyABtestKey=%221755213526.48%22; __ac_nonce=0689f18390096fdb3ee8f; __ac_signature=_02B4Z6wo00f01zYpLFgAAIDBBbzaGE6bBP82CSjAAKUf26; douyin.com; xg_device_score=7.547739235768296; device_web_cpu_core=8; device_web_memory_size=8; gulu_source_res=eyJwX2luIjoiYmY5ZDgyN2ZlMmQ3ZWYyOGU3ZjJmZGI4NTdhYTAxZGNlMjNlMTViMmRkZDM5ODM0MzJiMzE3NjA2OGU3OTEyNiJ9; stream_player_status_params=%22%7B%5C%22is_auto_play%5C%22%3A0%2C%5C%22is_full_screen%5C%22%3A0%2C%5C%22is_full_webscreen%5C%22%3A1%2C%5C%22is_mute%5C%22%3A0%2C%5C%22is_speed%5C%22%3A1%2C%5C%22is_visible%5C%22%3A0%7D%22; FOLLOW_LIVE_POINT_INFO=%22MS4wLjABAAAARchEEl9h74Ky-zMgdyLxIzDgZlhHOO3KB-V5h_nhI_Q%2F1755273600000%2F0%2F0%2F1755257556086%22; FOLLOW_NUMBER_YELLOW_POINT_INFO=%22MS4wLjABAAAARchEEl9h74Ky-zMgdyLxIzDgZlhHOO3KB-V5h_nhI_Q%2F1755273600000%2F0%2F1755256956086%2F0%22; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCRVFyUndCSkxSakF2K3g4SzQwZXFoWlNpUksrT3NGTEJIQzh1dWtMUzE2RXRCWThMbExkejBUY3RsL2c4SXdpYldTbThsZFZoaWIzUEs0OXArb0dJSlE9IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoyfQ%3D%3D; ttwid=1%7CN05Iw2OUzw_zu1lcgD2Nnt7DlhY3U6BnjTZUnGZMlj4%7C1755256961%7C86396afa0f78587ebc26f398c5c8335f390cfbd081c341af6a4325fd36f3c860; odin_tt=67e15087730b08cd9ce726980428ef07821c09faef000e0a5a94bebdaac917118c8a3761bf34d1eb4458a399f7ead945; biz_trace_id=84799947; "
        "sdk_source_info=7e276470716a68645a606960273f276364697660272927676c715a6d6069756077273f276364697660272927666d776a68605a607d71606b766c6a6b5a7666776c7571273f275e58272927666a6b766a69605a696c6061273f27636469766027292762696a6764695a7364776c6467696076273f275e582729277672715a646971273f2763646976602729277f6b5a666475273f2763646976602729276d6a6e5a6b6a716c273f2763646976602729276c6b6f5a7f6367273f27636469766027292771273f27323036373d3632303730303234272927676c715a75776a716a666a69273f2763646976602778; bit_env=ZaZWWZ1txljN5I_WfKkXqlt1dYQaaJjlDN4lVYH-oVy4OD8ZWpqdiRqqCQeQPoVaO0sQ2QloHyGtrbR1AIBn0Iea2Xn0gLqYrQsGvNZXUv8LTupNgCFq_RRIDpiB5GtvIag0RdQ0xASNwhpHHP7v9HttY2RjnX5mAHVMsL9I5Yg_0FXr22MftQcGS1G15ixTe8sEX2u-97oyTaGL7FJ6dYVtbTajxl74CePYTw6-U2VWweQ772lHZJ5wpkxlt46xAzWB8dbVThrgwuDNx0TCnuHY9tWRFJAV67pbbO48__c3dqTjfqFfVlCaaQwLEJ9O7vKXYpmn9Jp2Q_nMKJRlT3xEP_QgbZyzc5TUHaktZMuECUU1PqOGy-zpOvPQSUQKCW5RQVeklOubwfrDHN4ppYc9oMXdJc8TjJ7-9kQ14cG6rPXRcyQZaJdIQnlPF_59odjueeeb4k14N8VZ4mvu11pc3ntm426tVEw52-R21YEfDjWAxStsmfYRJ8yQKda2cLmvfvyGv6FRtc7ykBJGBdevuvj3Ng3vCynGIHNOQ4g%3D; passport_auth_mix_state=4zlyzmvyoew0ppa2qqsnuog7k19i3msu; IsDouyinActive=true; home_can_add_dy_2_desktop=%220%22; stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A1920%2C%5C%22screen_height%5C%22%3A1080%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A8%2C%5C%22device_memory%5C%22%3A8%2C%5C%22downlink%5C%22%3A6.25%2C%5C%22effective_type%5C%22%3A%5C%224g%5C%22%2C%5C%22round_trip_time%5C%22%3A100%7D%22"
    ),
}
class DouyinDownloader(object):
    """Download the videos a Douyin user posted after a cutoff date.

    The downloader pages through the user's post list, skips every video that
    is already recorded in ``file_info_file`` (matched by aweme id or by
    title), saves each new video under ``file_output_dir`` and appends a
    record for it to ``file_info_file``.
    """

    # Accepted spellings of ``min_date``; the date-only form means midnight.
    _MIN_DATE_FORMATS = ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d")
    # Status codes that are followed by hand when fetching a video file.
    _REDIRECT_STATUS_CODES = (301, 302, 303, 307, 308)
    # Upper bound on redirect hops so a redirect loop cannot spin forever.
    _MAX_REDIRECTS = 10
    # Upper bound on post-list pages fetched during one scan.
    _MAX_PAGES = 100

    def __init__(self,
                 platform: str,
                 user_name: str,
                 sec_user_id: str,
                 min_date: str = None,
                 headers: dict = None,
                 cookies: str = None,
                 check_interval: int = 10*60,
                 file_output_fmt: str = "mp4",
                 file_output_dir: str = "output",
                 file_info_file: str = "file_info.json",
                 ):
        """Store the configuration and create the shared HTTP client.

        :param platform: label used only in log prefixes.
        :param user_name: label used only in log prefixes.
        :param sec_user_id: value sent as ``sec_user_id`` to the post-list API.
        :param min_date: cutoff, ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d"``;
            defaults to the moment of construction.
        :param headers: request headers; ``DEFAULT_HEADERS`` when omitted.
        :param cookies: forwarded unchanged to ``httpx.AsyncClient``.
        :param check_interval: seconds between scans; also the HTTP timeout
            and the base unit of the access-denied back-off.
        :param file_output_fmt: extension given to downloaded files.
        :param file_output_dir: directory that receives the video files.
        :param file_info_file: JSON index of everything already downloaded.
        """
        self.platform = platform
        self.user_name = user_name
        self.sec_user_id = sec_user_id
        self.min_date = min_date or time.strftime("%Y-%m-%d %H:%M:%S")
        self.headers = headers or DEFAULT_HEADERS
        self.cookies = cookies
        self.check_interval = check_interval
        self.file_output_fmt = file_output_fmt
        self.file_output_dir: Path = Path(file_output_dir)
        self.file_info_file: Path = Path(file_info_file)

        self.flag = f"[{self.__class__.__name__}][{self.platform}][{self.user_name}]"

        # state
        self.run_recording = False
        self.downloaded_video_aweme_id_set = None
        self.downloaded_video_title_set = None

        # client
        self.client: httpx.AsyncClient = self.get_client()

    def get_client(self) -> "httpx.AsyncClient":
        """Build the HTTP/2 client shared by every request of this instance."""
        client = httpx.AsyncClient(
            http2=True,
            timeout=self.check_interval,
            limits=httpx.Limits(max_keepalive_connections=100, keepalive_expiry=self.check_interval * 2),
            headers=self.headers,
            cookies=self.cookies,
        )
        return client

    async def close(self) -> None:
        """Close the underlying HTTP client and its pooled connections."""
        await self.client.aclose()

    async def request(self, method: str, url: str, **kwargs):
        """Send one request through the shared client.

        :raises ConnectionError: on a read, connect or protocol failure; the
            original httpx exception is kept as ``__cause__``.
        """
        try:
            return await self.client.request(method, url, **kwargs)
        except (httpx.ReadError, httpx.ConnectError, httpx.RemoteProtocolError) as e:
            raise ConnectionError(
                f"request failed; error type: {type(e)}, error text: {str(e)}, url: {url}"
            ) from e

    async def save_downloaded_video_info(self, create_time: str, filename: Path, url: str, video_id: str, title: str, desc: str, tags: List[str]) -> str:
        """Add (or overwrite) one record in the JSON index and rewrite the file.

        :return: the index path as a POSIX string.
        """
        video_info = await self.load_downloaded_video_info()
        video_info[video_id] = {
            "create_time": create_time,
            "filename": filename.as_posix(),
            "title": title,
            "desc": desc,
            "tags": tags,
            "url": url,
            "video_id": video_id,
        }
        # The index may live outside the video directory, so make sure its
        # own parent exists before opening it for writing.
        self.file_info_file.parent.mkdir(parents=True, exist_ok=True)
        async with aiofiles.open(self.file_info_file.as_posix(), "w", encoding="utf-8") as f:
            video_info_ = json.dumps(video_info, ensure_ascii=False, indent=2)
            await f.write(f"{video_info_}\n")
        return self.file_info_file.as_posix()

    async def load_downloaded_video_info(self) -> Dict[str, dict]:
        """Read the JSON index; a missing or empty file yields an empty dict.

        Malformed JSON is deliberately allowed to raise: treating it as empty
        would let the next save overwrite the existing records.
        """
        if not self.file_info_file.exists():
            return dict()
        async with aiofiles.open(self.file_info_file.as_posix(), "r", encoding="utf-8") as f:
            data = await f.read()
        if not data.strip():
            return dict()
        video_info: dict = json.loads(data)
        return video_info

    async def get_downloaded_video_aweme_id_set(self) -> Set[str]:
        """Return the aweme ids of every recorded video."""
        video_info_dict: dict = await self.load_downloaded_video_info()
        return set(video_info_dict)

    async def get_downloaded_video_title_set(self) -> Set[str]:
        """Return the titles of every recorded video."""
        video_info_dict: dict = await self.load_downloaded_video_info()
        return {v["title"] for v in video_info_dict.values()}

    async def download_video_by_url(self, filename: Path, url: str):
        """Fetch ``url`` and store the body at ``filename``.

        Redirects are followed by hand, at most ``_MAX_REDIRECTS`` times. The
        body is written to a ``.part`` sibling first and renamed afterwards,
        so an interrupted write never leaves a truncated file under the final
        name.

        :return: ``filename`` as a ``Path``.
        :raises AssertionError: on an unexpected status code, a redirect
            without a ``Location`` header, or too many redirects.
        """
        filename = Path(filename)
        filename.parent.mkdir(parents=True, exist_ok=True)

        for _ in range(self._MAX_REDIRECTS + 1):
            response = await self.request(
                method="GET",
                url=url,
                headers=self.headers,
            )
            if response.status_code in self._REDIRECT_STATUS_CODES:
                location = response.headers.get("Location")
                if not location:
                    raise AssertionError(
                        f"Got status code {response.status_code} without a Location header"
                    )
                # Location may be relative; resolve it against the current URL.
                url = str(httpx.URL(url).join(location))
                continue
            if response.status_code != 200:
                raise AssertionError(f"Got status code {response.status_code}")

            partial = filename.with_name(f"{filename.name}.part")
            # aiofiles keeps the disk write off the event loop.
            async with aiofiles.open(partial.as_posix(), "wb") as f:
                await f.write(response.content)
            partial.replace(filename)
            return filename

        raise AssertionError(f"Too many redirects (more than {self._MAX_REDIRECTS})")

    @staticmethod
    def _parse_aweme(aweme: dict) -> dict:
        """Reduce one raw ``aweme`` object to the fields this class uses."""
        desc = aweme["desc"]
        create_time = aweme["create_time"]
        create_time_str = datetime.fromtimestamp(create_time).strftime("%Y%m%d_%H%M%S")
        video_url_list = aweme["video"]["play_addr"]["url_list"]

        # tags: keep first-seen order and drop duplicates so the result is
        # deterministic; ``text_extra`` may be absent or null.
        tag_order = dict()
        for extra in aweme.get("text_extra") or []:
            tag = extra.get("hashtag_name") or extra.get("search_text")
            if tag:
                tag_order[tag] = None
        tags = list(tag_order)

        # title: the description with every "#tag" removed. Longest tags go
        # first so that "#cat" cannot eat the front of "#cats".
        title: str = desc
        for tag in sorted(tags, key=len, reverse=True):
            title = title.replace(f"#{tag}", "")
        title = title.strip()

        return {
            "aweme_id": aweme["aweme_id"],
            "create_time": create_time,
            "create_time_str": create_time_str,
            "title": title,
            "desc": desc,
            "video_url_list": video_url_list,
            "tags": tags,
        }

    async def post_aweme_list(self, max_cursor: int = 0, count: int = 18):
        """Fetch one page of the user's posts.

        An access-denied answer (status 444, or status 200 with an empty
        body) is retried after a long sleep, for as long as it keeps
        happening.

        :return: a list of rows as produced by ``_parse_aweme``.
        :raises AssertionError: on any other non-200 status code.
        """
        url = "https://www.douyin.com/aweme/v1/web/aweme/post/"
        params = {
            "device_platform": "webapp",
            "aid": "6383",
            "channel": "channel_pc_web",
            "sec_user_id": self.sec_user_id,
            "max_cursor": max_cursor,
            "count": count,
            "publish_video_strategy_type": "2",
            "version_code": "290100",
            "version_name": "29.1.0",
        }

        # A loop rather than recursion: a long block must not grow the stack.
        while True:
            response = await self.request(
                method="GET",
                url=url,
                headers=self.headers,
                params=params,
            )
            if response.status_code == 444:
                # Access Denied
                delay = self.check_interval * 3
                logger.info(f"[post_aweme_list] access denied, sleep {delay}s to try again.")
            elif response.status_code == 200 and len(response.text) == 0:
                # Access Denied
                delay = self.check_interval * 10
                logger.info(f"[post_aweme_list] maybe the access denied, sleep {delay}s to try again.")
            else:
                break
            await asyncio.sleep(delay)

        if response.status_code != 200:
            raise AssertionError(f"request failed, status_code: {response.status_code}, text: {response.text}")

        js = response.json()
        # A null list is treated as an empty page.
        aweme_list = js["aweme_list"] or []
        return [self._parse_aweme(aweme) for aweme in aweme_list]

    @classmethod
    def _parse_min_date(cls, min_date: str) -> datetime:
        """Parse ``min_date`` using the first matching accepted format."""
        for fmt in cls._MIN_DATE_FORMATS:
            try:
                return datetime.strptime(min_date, fmt)
            except ValueError:
                continue
        raise ValueError(
            f"min_date {min_date!r} does not match any of {cls._MIN_DATE_FORMATS}"
        )

    async def get_video_list_by_min_date(self, min_date: str = "2025-06-10 00:00:00"):
        """Collect every post created strictly after ``min_date``.

        Pages are fetched until a page holds only older posts, the cursor
        stops moving, or ``_MAX_PAGES`` pages were read.

        :return: a list of download tasks (dicts) with one randomly chosen
            play URL each.
        :raises ValueError: when ``min_date`` matches no accepted format.
        """
        min_date_ = self._parse_min_date(min_date)

        result = list()
        seen_aweme_ids = set()
        max_cursor = 0
        for _ in range(self._MAX_PAGES):
            rows = await self.post_aweme_list(max_cursor=max_cursor, count=18)
            if all(datetime.fromtimestamp(row["create_time"]) < min_date_ for row in rows):
                break

            previous_cursor = max_cursor
            for row in rows:
                create_time = row["create_time"]
                aweme_id = row["aweme_id"]

                # The next page starts at the oldest creation time seen so far.
                max_cursor_ = int(create_time * 1000)
                if max_cursor == 0 or max_cursor_ < max_cursor:
                    max_cursor = max_cursor_

                if datetime.fromtimestamp(create_time) <= min_date_:
                    continue
                if aweme_id in seen_aweme_ids:
                    continue
                seen_aweme_ids.add(aweme_id)
                result.append({
                    "aweme_id": aweme_id,
                    "create_time_str": row["create_time_str"],
                    "title": row["title"],
                    "desc": row["desc"],
                    "video_url": random.choice(row["video_url_list"]),
                    "tags": row["tags"],
                })

            if max_cursor == previous_cursor:
                # The next request would be identical to this one.
                break
        return result

    async def download_new_video_list_by_min_date(self, min_date: str = "2025-06-10 00:00:00"):
        """Download and record every not-yet-known video newer than ``min_date``.

        A failed download is logged and skipped; it is not recorded, so the
        next scan tries it again.
        """
        new_video_list = await self.get_video_list_by_min_date(min_date=min_date)

        # Read the index once and keep the lookup sets up to date in memory
        # instead of re-reading the file twice per video.
        video_info = await self.load_downloaded_video_info()
        self.downloaded_video_aweme_id_set = set(video_info)
        self.downloaded_video_title_set = {v["title"] for v in video_info.values()}

        for new_video in new_video_list:
            aweme_id = new_video["aweme_id"]
            create_time_str = new_video["create_time_str"]
            title = new_video["title"]
            desc = new_video["desc"]
            video_url = new_video["video_url"]
            tags = new_video["tags"]

            if aweme_id in self.downloaded_video_aweme_id_set:
                continue
            # An empty title (a description made only of hashtags) says
            # nothing about duplication, so it is never matched by title.
            if title and title in self.downloaded_video_title_set:
                continue

            title_ = re.sub(r'[\\/:*?"<>|]', '_', title)
            title_ = title_[:50]
            filename = self.file_output_dir / f"[{aweme_id}][{create_time_str}]{title_}.{self.file_output_fmt}"

            if not filename.exists():
                logger.info(f"download video; {self.flag}; "
                            f"aweme_id: {aweme_id}, create_time_str: {create_time_str}, "
                            f"title: {title_}, "
                            f"desc: {desc}"
                            )
                try:
                    await self.download_video_by_url(filename, video_url)
                except Exception as e:
                    logger.error(f"download video failed; error type: {type(e)}, error text: {str(e)}, url: {video_url}")
                    continue

            await self.save_downloaded_video_info(create_time_str, filename, video_url, aweme_id, title, desc, tags)
            self.downloaded_video_aweme_id_set.add(aweme_id)
            self.downloaded_video_title_set.add(title)

    async def run(self):
        """Perform one scan using the cutoff given at construction."""
        await self.download_new_video_list_by_min_date(self.min_date)
        return

    async def start(self):
        """Scan forever, sleeping ``check_interval`` seconds between scans.

        A failing scan is logged and does not stop the loop.
        """
        while True:
            try:
                await self.run()
                logger.info(f"{self.flag}新视频检测... 刷新间隔 {self.check_interval}s")
            except Exception as error:
                logger.exception(f"{self.flag}新视频检测错误\n{repr(error)}")
            await asyncio.sleep(self.check_interval)
async def main():
    """Run one download pass for a single hard-coded Douyin account."""
    output_dir = project_path / "data/video/douyin/陈杰森"

    dy_downloader = DouyinDownloader(
        platform="douyin",
        user_name="douyin",
        # Full timestamp: this is the format the downloader parses when the
        # stored cutoff is used through run()/start().
        min_date="2025-06-10 00:00:00",
        # alternative account:
        # sec_user_id="MS4wLjABAAAAQinRMLyQNYA45OYXoCDrwszhRGaDVirRE1fTNSaGGkc",
        sec_user_id="MS4wLjABAAAATGoBrO7yiJ3q9go4fxq9JXjrnP1bFpdkgKckC1IpfXA_vrjSmL9ZtjmTju8ApwbT",
        file_output_dir=output_dir.as_posix(),
        file_info_file=(output_dir / "file_info.json").as_posix(),
    )
    try:
        await dy_downloader.download_new_video_list_by_min_date(min_date="2025-07-03 00:00:00")
    finally:
        # Release the pooled HTTP connections even when the pass fails.
        await dy_downloader.client.aclose()
    return


if __name__ == "__main__":
    asyncio.run(main())