# -*- coding: utf-8 -*-
'''
When the RSS crawlers are run in batch, the RSS files are generated right after
the data is fetched, so this script does not need to be executed.
Run it only when the RSS files have to be generated manually.
'''
import os
import threading
import time

import PyRSS2Gen
import pymongo

import tools_load_config

config_json = tools_load_config.load_config()
PROJECT_NAME = config_json.get('PROJECT_NAME')
DB_USER = config_json.get('DB_USER')
DB_PASSWORD = config_json.get('DB_PASSWORD')
DB_IP = config_json.get('DB_IP')
DB_PORT = config_json.get('DB_PORT')
MONGO_LINK = f'mongodb://{DB_USER}:{DB_PASSWORD}@{DB_IP}:{DB_PORT}/'
MAIL_HOST = config_json.get('MAIL_HOST')
MAIL_USER = config_json.get('MAIL_USER')
MAIL_PASS = config_json.get('MAIL_PASS')
MAIL_SENDER = config_json.get('MAIL_SENDER')
MAIL_RECEIVERS = config_json.get('MAIL_RECEIVERS')

now_day = time.strftime('%Y-%m-%d', time.localtime())
rss_base_url = 'http://home.erhe.link:20002/xmlfile/'
# project root: everything up to (and including) the project directory name
base_project = os.path.join(os.getcwd().split(PROJECT_NAME)[0], PROJECT_NAME)

# several worker threads append to the same rss_url.txt, so serialize the writes
url_file_lock = threading.Lock()


def string_mapping(string):
    '''Replace the Chinese category names used in collection names with their
    English equivalents so they are safe to use in file names.'''
    mapping_dict = {
        '掌设': 'handheld',
        '汽车': 'car',
        '评测': 'testing',
        '美食': 'food',
        '电脑': 'pc',
        '视听': 'audiovisual',
        '腕表': 'watch',
        '单车': 'bicycle',
        '摄影': 'photograph',
        '家居': 'home'
    }
    for key in mapping_dict:
        if key in string:
            string = string.replace(key, mapping_dict[key])
    return string


def handle_data(db_and_collection):
    '''Read one MongoDB collection and write its documents out as an RSS file.'''
    db = db_and_collection['db']
    collection = db_and_collection['collection']
    title = db.replace('RSS_', '')
    print(f'reading data from {db} -- {collection}')
    client = pymongo.MongoClient(MONGO_LINK)
    _get_db = client[db]
    _get_collection = _get_db[collection]
    gen_file_name = f'{title}_{collection}_rss.xml'
    # map the Chinese category names in the file name to English
    gen_file_name = string_mapping(gen_file_name)
    link = db_and_collection['link']
    description = db_and_collection['source_type']
    lastBuildDate = now_day
    items = []
    xml_file = os.path.join(base_project, 'news', 'rss_xmlfile')
    path = os.path.join(xml_file, gen_file_name)
    xml_url_file = os.path.join(xml_file, 'rss_url.txt')
    # load the documents from MongoDB and turn each one into an RSS item
    for data in _get_collection.find():
        item = PyRSS2Gen.RSSItem(
            title=data.get('title'),
            link=data.get('source_url'),
            description=data.get('context'),
            pubDate=data.get('posted_date'),
        )
        items.append(item)
    gen2rss(gen_file_name, title, link, description, lastBuildDate, items, path)
    # record the public URL of the generated feed
    with url_file_lock:
        with open(xml_url_file, 'a') as f:
            f.write(rss_base_url + gen_file_name + '\n\n')


def gen2rss(gen_file_name, title, link, description, lastBuildDate, items, path):
    '''Build the RSS channel and write it to disk as an XML file.'''
    rss = PyRSS2Gen.RSS2(
        title=title,
        link=link,
        description=description,
        lastBuildDate=lastBuildDate,
        items=items)
    print(f'generating rss file: path: {path}, file name: {gen_file_name}')
    with open(path, 'w', encoding='utf-8') as f:
        rss.write_xml(f, encoding='utf-8')


def run():
    xml_file = os.path.join(base_project, 'news', 'rss_xmlfile')
    os.makedirs(xml_file, exist_ok=True)
    # start from a clean URL list on every run
    rss_url_txt = os.path.join(xml_file, 'rss_url.txt')
    if os.path.exists(rss_url_txt):
        os.remove(rss_url_txt)
    # each entry: [0] database name, [1] crawled site URL, [2] site name
    dbs = [
        ['RSS_HelloGithub', 'https://hellogithub.com/', 'HelloGithub'],
        ['RSS_apprcn', 'https://free.apprcn.com/', '反斗限免'],
        ['RSS_news', 'https://www.anyknew.com/', '聚合新闻'],
        ['RSS_chiphell', 'https://www.chiphell.com/', 'chiphell']
    ]
    db_and_collections = []
    client = pymongo.MongoClient(MONGO_LINK)
    for db in dbs:
        if db[0] not in client.list_database_names():
            print(f'database {db[0]} not found')
            continue
        _get_db = client[db[0]]
        for collection in _get_db.list_collection_names():
            db_and_collections.append({
                'db': db[0],
                'collection': collection,
                'link': db[1],
                'source_type': db[2]
            })
    # sequential alternative:
    # for d in db_and_collections:
    #     handle_data(d)
    threads = []
    # create one worker thread per database/collection pair
    for i in db_and_collections:
        thread = threading.Thread(target=handle_data, args=(i,))
        threads.append(thread)
        thread.start()
    # wait for all threads to finish
    for thread in threads:
        thread.join()


if __name__ == '__main__':
    run()