jack 1 vuosi sitten
vanhempi
sitoutus
fa49daf463

+ 3 - 3
config.json

@@ -1,5 +1,5 @@
 {
-  "PROJECT_NAME": "auto",
+  "PROJECT_NAME": "auto_news_scheduler",
   "MAIL_HOST": "smtp.163.com",
   "MAIL_USER": "pushmessagebot@163.com",
   "MAIL_PASS": "WSMSRKBKXIHIQWTU",
@@ -7,8 +7,8 @@
   "MAIL_RECEIVERS": "pushmessagebot@163.com",
   "DB_USER": "root",
   "DB_PASSWORD": "aaaAAA111!!!",
-  "DB_IP": "192.168.100.146",
-  "DB_PORT": "38001",
+  "DB_IP": "127.0.0.1",
+  "DB_PORT": "38000",
   "USE_PROXY": true,
   "PROXY_HOST": "192.168.100.146",
   "PROXY_PORT": 7890

+ 39 - 15
main.py

@@ -5,36 +5,60 @@
 scheduler.add_job(midnight_task, 'cron', hour=0, minute=0) # 每天定时执行
 scheduler.add_job(test_error, 'interval', seconds=2) # 循环间隔多少秒执行
 scheduler.add_job(weekly_task, 'cron', day_of_week='mon,wed,sat', hour=22, minute=30) # 添加定时任务,设置为每周一、三、六晚上10点30分执行
+scheduler.add_job(lambda: apprcn.main(), 'cron', hour='0-23', minute=0) # 每天的每小时整点执行一次
 '''
 from apscheduler.schedulers.background import BackgroundScheduler
 import time
 
+# 日志类
+import utils_daily_logs_generate
+import utils_daily_logs_send
+import utils_timing_remove_data
 
-def hello_world():
-    print("Hello World")
+# 新闻类爬虫
+import news_get_apprcn
+import news_get_chiphell
+import news_get_hello_github
+import news_get_news
+import news_get_36kr_info
+import news_get_36kr_key
 
+# 大乐透
+import spider_get_and_check_dlt
 
-def hello_kitty():
-    print("Hello Kitty")
+# 发送新闻汇总邮件
+import utils_news_data_collation
 
+# 创建 BackgroundScheduler 实例
+scheduler = BackgroundScheduler()
 
-def test_error():
-    try:
-        a = 1 / 0
-    except ZeroDivisionError:
-        print("Division by zero")
+# 每天 0 点创建日志文件
+scheduler.add_job(lambda: utils_daily_logs_generate.LogsHandle().logs_generate(), 'cron', hour=0, minute=0)
 
+# 每天 23 点 59 分发送当天日志
+scheduler.add_job(lambda: utils_daily_logs_send.LogsHandle().logs_send(), 'cron', hour=23, minute=59)
 
-# 创建 BackgroundScheduler 实例
-scheduler = BackgroundScheduler()
+# 每周1,3,6 晚上 10 点 30 分执行 dlt
+scheduler.add_job(lambda: spider_get_and_check_dlt.Luanch().main(), 'cron', day_of_week='mon,wed,sat', hour=22,
+                  minute=30)
+
+# 创建任务对象
+
+
+# 执行多个新闻爬取的任务
+scheduler.add_job(lambda: news_get_apprcn.APPRCN().main(), 'cron', hour='3,6,9,13,15,18,21', minute=0)
+scheduler.add_job(lambda: news_get_chiphell.CHIPHELL().main(), 'cron', hour='3,6,9,13,15,18,21', minute=0)
+scheduler.add_job(lambda: news_get_hello_github.HelloGithub().main(), 'cron', hour='3,6,9,13,15,18,21', minute=0)
+scheduler.add_job(lambda: news_get_news.HotNews().main(), 'cron', hour='3,6,9,13,15,18,21', minute=0)
+scheduler.add_job(lambda: news_get_36kr_info.Get36krInfo().main(), 'cron', hour='3,6,9,13,15,18,21', minute=0)
+scheduler.add_job(lambda: news_get_36kr_key.Get36krKey().main(), 'cron', hour='3,6,9,13,15,18,21', minute=0)
 
-# 添加定时任务
-scheduler.add_job(hello_world, 'interval', seconds=10)
-scheduler.add_job(hello_kitty, 'interval', seconds=15)
-scheduler.add_job(test_error, 'interval', seconds=2)
+# 发送新闻汇总邮件的任务
+scheduler.add_job(lambda: utils_news_data_collation.NewsDataCollation().main(), 'cron', hour='8,12,19,23', minute=0)
 
 # 启动调度器
 scheduler.start()
+print('\n定时任务开始执行')
 
 # 为了防止程序退出,这里使用一个无限循环
 try:

+ 9 - 7
news_get_36kr_info.py

@@ -32,6 +32,7 @@ class Get36krInfo:
         self.db = 'NEWS'
         self.collection = '36kr_info'
         self.send_email_datas = {}
+        self.send_email_now = 0
 
     def req(self):
         result_data = []
@@ -145,10 +146,11 @@ class Get36krInfo:
         if result_data:
             new_datas = self.save_to_mongo(result_data)
 
-            if new_datas:
-                self.send_to_email(new_datas)
-            else:
-                print('无新数据')
+            if self.send_email_now:
+                if new_datas:
+                    self.send_to_email(new_datas)
+                else:
+                    print('无新数据')
 
             self.logs_handle.logs_write('36kr - info', '36kr - info 数据获取完成', 'done', False)
             print('done')
@@ -156,6 +158,6 @@ class Get36krInfo:
             self.logs_handle.logs_write('36kr - info', '无法获取 36kr - info 数据', 'error', False)
 
 
-if __name__ == '__main__':
-    g = Get36krInfo()
-    g.main()
+# if __name__ == '__main__':
+#     g = Get36krInfo()
+#     g.main()

+ 9 - 7
news_get_36kr_key.py

@@ -28,6 +28,7 @@ class Get36krKey:
         self.db = 'NEWS'
         self.collection = '36kr_key'
         self.send_email_datas = {}
+        self.send_email_now = 0
 
     def req(self):
         result_data = []
@@ -141,10 +142,11 @@ class Get36krKey:
         if result_data:
             new_datas = self.save_to_mongo(result_data)
 
-            if new_datas:
-                self.send_to_email(new_datas)
-            else:
-                print('无新数据')
+            if self.send_email_now:
+                if new_datas:
+                    self.send_to_email(new_datas)
+                else:
+                    print('无新数据')
 
             self.logs_handle.logs_write('36kr - key', '36kr - key 数据获取完成', 'done', False)
             print('done')
@@ -152,6 +154,6 @@ class Get36krKey:
             self.logs_handle.logs_write('36kr - key', '无法获取 36kr - key 数据', 'error', False)
 
 
-if __name__ == '__main__':
-    g = Get36krKey()
-    g.main()
+# if __name__ == '__main__':
+#     g = Get36krKey()
+#     g.main()

+ 10 - 7
news_get_apprcn.py

@@ -26,6 +26,7 @@ class APPRCN(object):
         collection = 'apprcn_info'
         self.mongo = MongoHandle(db=db, collection=collection, del_db=False, del_collection=False, auto_remove=0)
         self.send_email_datas = []
+        self.send_email_now = 0
 
     def main(self):
         self.logs_handle.logs_write('apprcn', '开始获取反斗限免数据', 'start', False)
@@ -35,7 +36,8 @@ class APPRCN(object):
         if response_data:
             self.save_to_mongo(response_data)
 
-            self.send_to_email()
+            if self.send_email_now:
+                self.send_to_email()
 
             self.logs_handle.logs_write('apprcn', '反斗限免数据获取完成', 'done', False)
             print('done')
@@ -69,7 +71,8 @@ class APPRCN(object):
             post_date_list = re.findall('<time>(.*?)</time>', content)
             source_data_list = re.findall('<a class="cat" href="(.*?)"', content)
 
-            for title, context, post_date, source_data in zip(title_list, context_list, post_date_list, source_data_list):
+            for title, context, post_date, source_data in zip(title_list, context_list, post_date_list,
+                                                              source_data_list):
                 response_data.append({
                     "title": title,
                     "context": context,
@@ -111,14 +114,14 @@ class APPRCN(object):
         if self.send_email_datas:
             text = ''
             for data in self.send_email_datas:
-                text += '标题: %s\n内容: %s\n时间: %s\n链接: %s\n\n' % (data['title'], data['context'], data['posted_date'], data['source_url'])
+                text += '标题: %s\n内容: %s\n时间: %s\n链接: %s\n\n' % (
+                data['title'], data['context'], data['posted_date'], data['source_url'])
             send_email = SendEmail(subject='反斗限免', title='反斗限免', text=text)
             send_email.send()
             self.logs_handle.logs_write('apprcn', '发送邮件完成', 'done', False)
         else:
             self.logs_handle.logs_write('apprcn', '没有新数据, 不发送邮件', 'done', False)
 
-
-if __name__ == "__main__":
-    A = APPRCN()
-    A.main()
+# if __name__ == "__main__":
+#     A = APPRCN()
+#     A.main()

+ 10 - 8
news_get_chiphell.py

@@ -7,7 +7,7 @@ import random
 import sys
 import threading
 
-sys.path.append(os.path.join(os.getcwd().split('auto')[0], 'auto'))
+sys.path.append(os.path.join(os.getcwd().split('auto_news_scheduler')[0], 'auto_news_scheduler'))
 import re
 import time
 from datetime import datetime
@@ -29,6 +29,7 @@ class CHIPHELL(object):
             'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8'
         }
         self.send_email_datas = []
+        self.send_email_now = 0
 
     def req(self, source, target):
         print(f'正在获取 {source} 数据')
@@ -55,7 +56,8 @@ class CHIPHELL(object):
                     context_list = re.findall('class="tn" /></a></div>([\S\s]*?)</dd>', dl)
                     post_time_list = re.findall('<span class="xg1"> (.*?)</span>', dl)
 
-                    for url, title, img_url, context, post_time in zip(url_list, title_list, img_url_list, context_list, post_time_list):
+                    for url, title, img_url, context, post_time in zip(url_list, title_list, img_url_list, context_list,
+                                                                       post_time_list):
                         # 清理正文内容的空格和换行等字符
                         if context:
                             for i in [' ', '\n']:
@@ -224,10 +226,10 @@ class CHIPHELL(object):
             return False
 
         # 如果 self.send_email_datas 中有数据, 则发送邮件
-        if self.send_email_datas:
-            self.send_to_email()
+        if self.send_email_now:
+            if self.send_email_datas:
+                self.send_to_email()
 
-
-if __name__ == '__main__':
-    C = CHIPHELL()
-    C.main()
+# if __name__ == '__main__':
+#     C = CHIPHELL()
+#     C.main()

+ 16 - 10
news_get_hello_github.py

@@ -5,7 +5,7 @@ Hello Github
 import os
 import sys
 
-sys.path.append(os.path.join(os.getcwd().split('auto')[0], 'auto'))
+sys.path.append(os.path.join(os.getcwd().split('auto_news_scheduler')[0], 'auto_news_scheduler'))
 import threading
 import time
 from datetime import datetime
@@ -27,11 +27,12 @@ class HelloGithub(object):
         self.collection = 'HelloGithub_info'
         self.source_url = 'https://hellogithub.com/repository/'
         self.send_email_datas = []
+        self.send_email_now = 0
 
     def main(self):
         self.logs_handle.logs_write('HelloGithub', '开始获取 HelloGithub 数据', 'start', False)
 
-        targets = ['last', 'hot']
+        targets = ['featured']
 
         response_datas = []
 
@@ -48,10 +49,11 @@ class HelloGithub(object):
         self.logs_handle.logs_write('HelloGithub', 'HelloGithub 数据获取完成', 'done', False)
         print('获取 HelloGithub 数据 done')
 
-        if self.send_email_datas:
-            self.send_to_email()
-        else:
-            print('没有新数据, 不发送邮件')
+        if self.send_email_now:
+            if self.send_email_datas:
+                self.send_to_email()
+            else:
+                print('没有新数据, 不发送邮件')
 
     def req(self, target):
         print('开始获取 HelloGithub {} 数据'.format(target))
@@ -60,6 +62,9 @@ class HelloGithub(object):
             response = httpx.get(url='https://api.hellogithub.com/v1/?sort_by={}&tid=&page={}'.format(target, i),
                                  headers=self.headers)
             if response.status_code != 200:
+                print(
+                    '获取 HelloGithub {} 数据, 状态码: {}, 程序退出\n检查目标地址: https://api.hellogithub.com/v1/?sort_by={}&tid=&page={}'.format(
+                        target, response.status_code, target, i))
                 self.logs_handle.logs_write('HelloGithub', '请求失败, 状态码: %s' % response.status_code, 'error',
                                             False)
                 exit(0)
@@ -88,7 +93,8 @@ class HelloGithub(object):
     def save_to_mongo(self, data):
         print(f'开始储存 HelloGithub 数据')
         for data_to_insert in data:
-            mongo = MongoHandle(db=self.db, collection=self.collection, del_db=False, del_collection=False, auto_remove=0)
+            mongo = MongoHandle(db=self.db, collection=self.collection, del_db=False, del_collection=False,
+                                auto_remove=0)
 
             try:
                 # 检查数据库中是否存在匹配的文档
@@ -125,6 +131,6 @@ class HelloGithub(object):
         self.logs_handle.logs_write('HelloGithub', f'{title}-发送邮件完成', 'done', False)
 
 
-if __name__ == "__main__":
-    H = HelloGithub()
-    H.main()
+# if __name__ == "__main__":
+#     H = HelloGithub()
+#     H.main()

+ 9 - 7
news_get_news.py

@@ -26,6 +26,7 @@ class HotNews():
             'binary': 'https://www.anyknew.com/api/v1/cats/binary'
         }
         self.send_email_datas = []
+        self.send_email_now = 0
 
     def main(self):
         self.logs_handle.logs_write('聚合新闻', '任务开始', 'start', False)
@@ -35,11 +36,12 @@ class HotNews():
         if resp_data:
             self.save_to_mongo(resp_data)
 
-            if self.send_email_datas:
-                print('准备发送邮件')
-                self.send_to_email()
-            else:
-                print('无新数据')
+            if self.send_email_now:
+                if self.send_email_datas:
+                    print('准备发送邮件')
+                    self.send_to_email()
+                else:
+                    print('无新数据')
 
         else:
             self.logs_handle.logs_write('聚合新闻', '获取数据为空', 'error', False)
@@ -137,5 +139,5 @@ class HotNews():
         print('邮件已发送')
 
 
-if __name__ == '__main__':
-    HotNews().main()
+# if __name__ == '__main__':
+#     HotNews().main()

+ 4 - 4
spider_get_and_check_dlt.py

@@ -2,7 +2,7 @@
 import os
 import sys
 
-sys.path.append(os.path.join(os.getcwd().split('auto')[0], 'auto'))
+sys.path.append(os.path.join(os.getcwd().split('auto_news_scheduler')[0], 'auto_news_scheduler'))
 import threading
 from datetime import datetime
 import time
@@ -284,9 +284,9 @@ class Luanch(object):
             Logs.logs_write('auto_get_and_check_dlt', '获取数据失败', 'error', False)
 
 
-if __name__ == '__main__':
-    L = Luanch()
-    L.main()
+# if __name__ == '__main__':
+#     L = Luanch()
+#     L.main()
 
     # ## 单独获取数据
     # G = GetData()

+ 1 - 1
spider_get_free_ip_proxy.py

@@ -5,7 +5,7 @@
 import os
 import sys
 
-sys.path.append(os.path.join(os.getcwd().split('auto')[0], 'auto'))
+sys.path.append(os.path.join(os.getcwd().split('auto_news_scheduler')[0], 'auto_news_scheduler'))
 from datetime import datetime
 import time
 import re

+ 7 - 7
spider_get_one_week_weather.py

@@ -5,7 +5,7 @@
 import os
 import sys
 
-sys.path.append(os.path.join(os.getcwd().split('auto')[0], 'auto'))
+sys.path.append(os.path.join(os.getcwd().split('auto_news_scheduler')[0], 'auto_news_scheduler'))
 import time
 from datetime import datetime
 import httpx
@@ -64,9 +64,9 @@ class Weather():
         self.logs_handle.logs_write('Weather forecast', '天气预报数据已获取', 'done', False)
 
 
-if __name__ == "__main__":
-    L = LogsHandle()
-    L.logs_write('Weather forecast', '开始获取天气预报数据', 'start', False)
-    W = Weather()
-    W.main()
-    L.logs_write('Weather forecast', '天气预报数据已获取', 'done', False)
+# if __name__ == "__main__":
+#     L = LogsHandle()
+#     L.logs_write('Weather forecast', '开始获取天气预报数据', 'start', False)
+#     W = Weather()
+#     W.main()
+#     L.logs_write('Weather forecast', '天气预报数据已获取', 'done', False)

+ 3 - 3
tools_load_config.py

@@ -11,8 +11,8 @@ import sys
 
 def load_config():
     try:
-        sys.path.append(os.path.join(os.getcwd().split('auto')[0], 'auto'))
-        base_project = os.path.join(os.getcwd().split('auto')[0], 'auto')
+        sys.path.append(os.path.join(os.getcwd().split('auto_news_scheduler')[0], 'auto_news_scheduler'))
+        base_project = os.path.join(os.getcwd().split('auto_news_scheduler')[0], 'auto_news_scheduler')
 
         config_path = os.path.join(base_project, 'config.json')
         config_json = {}
@@ -30,4 +30,4 @@ def load_config():
 
 
 def get_base_path():
-    return os.path.join(os.getcwd().split('auto')[0], 'auto')
+    return os.path.join(os.getcwd().split('auto_news_scheduler')[0], 'auto_news_scheduler')

+ 11 - 12
utils_daily_logs_send.py

@@ -26,7 +26,6 @@ MAIL_SENDER = config_json.get('MAIL_SENDER')
 MAIL_RECEIVERS = config_json.get('MAIL_RECEIVERS')
 
 now_day = time.strftime('%Y-%m-%d', time.localtime())
-rss_base_url = 'http://home.erhe.link:20002/xmlfile/'
 
 
 class LogsHandle(object):
@@ -47,11 +46,12 @@ class LogsHandle(object):
         cursor = self.mongo.collection.find()
         # 遍历结果集
         for record in cursor:
-            text += "logs_source: {}, logs_detail: {}, state: {} logs_create_time: {}\n\n".format(record.setdefault('title'),
-                                                                                                  record.setdefault('content'),
-                                                                                                  record.setdefault('state'),
-                                                                                                  record.setdefault('create_datetime'),
-                                                                                                  )
+            text += "logs_source: {}, logs_detail: {}, state: {} logs_create_time: {}\n\n".format(
+                record.setdefault('title'),
+                record.setdefault('content'),
+                record.setdefault('state'),
+                record.setdefault('create_datetime'),
+            )
         S = SendEmail(subject=subject, title=title, text=text)
         S.send()
 
@@ -104,7 +104,7 @@ class SendEmail(object):
     def send(self):
         message = MIMEText(self.text, 'plain', 'utf-8')
         message['From'] = Header(self.title, 'utf-8')
-        message['To'] = Header("auto collection", 'utf-8')
+        message['To'] = Header("auto", 'utf-8')
 
         subject = self.subject
         message['Subject'] = Header(subject, 'utf-8')
@@ -118,8 +118,7 @@ class SendEmail(object):
         except smtplib.SMTPException:
             print("Error: 无法发送邮件")
 
-
-if __name__ == '__main__':
-    print("发送当天日志:start")
-    LogsHandle().logs_send()
-    print("发送当天日志:done")
+# if __name__ == '__main__':
+#     print("发送当天日志:start")
+#     LogsHandle().logs_send()
+#     print("发送当天日志:done")

+ 5 - 5
utils_timing_remove_data.py

@@ -12,8 +12,8 @@ import smtplib
 from email.mime.text import MIMEText
 from email.header import Header
 
-sys.path.append(os.path.join(os.getcwd().split('auto')[0], 'auto'))
-base_project = os.path.join(os.getcwd().split('auto')[0], 'auto')
+sys.path.append(os.path.join(os.getcwd().split('auto_news_scheduler')[0], 'auto_news_scheduler'))
+base_project = os.path.join(os.getcwd().split('auto_news_scheduler')[0], 'auto_news_scheduler')
 import json
 
 config_path = os.path.join(base_project, 'config.json')
@@ -191,6 +191,6 @@ class AutoRemoveData(object):
         self.logs.logs_write(f'自动删除 {self.day} 天以上数据', f'自动删除 {self.day} 天数以上数据完成', 'done', False)
 
 
-if __name__ == "__main__":
-    A = AutoRemoveData()
-    A.main()
+# if __name__ == "__main__":
+#     A = AutoRemoveData()
+#     A.main()