Spaces:
Paused
Paused
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: | |
# 1. 不得用于任何商业用途。 | |
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 | |
# 3. 不得进行大规模爬取或对平台造成运营干扰。 | |
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 | |
# 5. 不得用于任何非法或不当的用途。 | |
# | |
# 详细许可条款请参阅项目根目录下的LICENSE文件。 | |
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 | |
import asyncio | |
import sys | |
from typing import Optional | |
import cmd_arg | |
import config | |
import db | |
from base.base_crawler import AbstractCrawler | |
from media_platform.bilibili import BilibiliCrawler | |
from media_platform.douyin import DouYinCrawler | |
from media_platform.kuaishou import KuaishouCrawler | |
from media_platform.tieba import TieBaCrawler | |
from media_platform.weibo import WeiboCrawler | |
from media_platform.xhs import XiaoHongShuCrawler | |
from media_platform.zhihu import ZhihuCrawler | |
class CrawlerFactory: | |
CRAWLERS = { | |
"xhs": XiaoHongShuCrawler, | |
"dy": DouYinCrawler, | |
"ks": KuaishouCrawler, | |
"bili": BilibiliCrawler, | |
"wb": WeiboCrawler, | |
"tieba": TieBaCrawler, | |
"zhihu": ZhihuCrawler | |
} | |
def create_crawler(platform: str) -> AbstractCrawler: | |
crawler_class = CrawlerFactory.CRAWLERS.get(platform) | |
if not crawler_class: | |
raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...") | |
return crawler_class() | |
async def main(): | |
# parse cmd | |
await cmd_arg.parse_cmd() | |
# init db | |
if config.SAVE_DATA_OPTION == "db": | |
await db.init_db() | |
crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM) | |
await crawler.start() | |
if config.SAVE_DATA_OPTION == "db": | |
await db.close() | |
if __name__ == '__main__': | |
try: | |
# asyncio.run(main()) | |
asyncio.get_event_loop().run_until_complete(main()) | |
except KeyboardInterrupt: | |
sys.exit() | |