jack 1 jaar geleden
bovenliggende
commit
2f82916079
2 gewijzigde bestanden met toevoegingen van 108 en 59 verwijderingen
  1. 3 59
      .gitignore
  2. 105 0
      kaizty_spider.py

+ 3 - 59
.gitignore

@@ -1,60 +1,4 @@
-# ---> Python
-# Byte-compiled / optimized / DLL files
+.DS_Store
 __pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-env/
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-*.egg-info/
-.installed.cfg
-*.egg
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*,cover
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
+*.pyc
+.idea

+ 105 - 0
kaizty_spider.py

@@ -0,0 +1,105 @@
+import os.path
+import re
+import random
+import time
+
+import httpx
+
# Album-specific page id (opaque token) including the query '?' separator.
url_key = 'UHh0dkRPOWwyV2R2V0ZFU3hMRFZaZz09.html?'
# Path segment for photo albums on the site.
url_photos = '/photos/'
base_url = 'https://www.kaizty.com/'
# Pagination query-string template, e.g. 'page=3'.
url_page = 'page={}'

# Desktop-browser User-Agent so the site serves the normal HTML markup.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
}
+
+
def get_pages():
    """Walk the paginated album and collect every image URL.

    Fetches successive pages (the site exposes no explicit page count)
    and scrapes the ``<meta itemprop="image">`` URLs from each one,
    stopping after ``max_error_times`` failed extractions.

    Returns:
        tuple[list[str], str]: all image URLs in page order, and a
        filesystem-safe album title ('' if it was never extracted).
    """
    title = ''
    all_img_list = []

    error_times = 0
    max_error_times = 2  # give up after this many failed pages
    page = 1

    while error_times < max_error_times:
        print('正在获取第 {} 页数据'.format(page))
        url = base_url + url_photos + url_key + url_page.format(page)

        response = httpx.get(url, headers=headers)
        response.encoding = 'utf-8'
        html = response.text

        # Raw string: the original pattern's '\!' / '\[' are invalid
        # string escapes that only worked by accident.
        target_block = re.findall(r'<!\[endif\]--><title>(.*?)<meta property="og:locale"', html)
        if not target_block:
            # BUG FIX: the original `continue`d here without bumping
            # error_times, so once the site stopped returning the expected
            # markup (e.g. past the last page) the loop ran forever.
            error_times += 1
            page += 1
            continue
        target_block = target_block[0]

        # Extract the album title once, from the first successful page.
        if not title:
            re_title = re.findall(r'(.*?)\| Page', target_block)
            if not re_title:
                print('获取 title 失败')
                error_times += 1
                page += 1
                continue
            # Drop characters that are illegal in Windows file names,
            # then remove spaces, so the title can name a directory.
            title = re.sub(r'[<>:"/\\|?*]', '', re_title[0]).replace(' ', '')

        img_list = re.findall('<meta itemprop="image" content="(.*?)"', target_block)
        if not img_list:
            # page is incremented after this message now, so the number
            # printed matches the page that actually failed.
            print('获取图片链接失败, 第{}页'.format(page))
            error_times += 1
            page += 1
            continue

        all_img_list += img_list
        page += 1
        # time.sleep(random.uniform(2, 3))

    return all_img_list, title
+
+
def get_imgs(all_img_list, title):
    """Download images into a directory named after the album title.

    Supports resuming: if numbered image files already exist in the
    target directory, downloading restarts from the highest existing
    number (re-fetching that one in case it was truncated).

    Args:
        all_img_list: image URLs in page order.
        title: target directory name (created if missing).

    Raises:
        Exception: when an image request returns a non-200 status.
    """
    print('\n\n开始保存图片')

    img_dir = os.path.join(os.getcwd(), title)
    # makedirs + exist_ok avoids the check-then-create race of the
    # original `if not exists: mkdir`.
    os.makedirs(img_dir, exist_ok=True)

    # Collect the numeric prefixes of existing downloads. sorted() is
    # required because os.listdir() order is arbitrary (the original read
    # files[-1] unsorted); the isdigit() filter prevents a crash on stray
    # files such as .DS_Store.
    existing_nums = sorted(
        int(f.split('.')[0])
        for f in os.listdir(img_dir)
        if os.path.isfile(os.path.join(img_dir, f)) and f.split('.')[0].isdigit()
    )

    # BUG FIX: the original defaulted to 1 on a fresh run, silently
    # skipping image index 0. Start at 0 when nothing is saved yet.
    start = existing_nums[-1] if existing_nums else 0

    for n in range(start, len(all_img_list)):
        img = httpx.get(all_img_list[n], headers=headers)

        if img.status_code != 200:
            print('请求图片错误, 程序退出')
            raise Exception(f'状态码 {img.status_code}')

        # Name files by index so the resume logic above can find the
        # highest completed number; keep the URL's file extension.
        file_name = f"{n:04d}" + "." + all_img_list[n].split(".")[-1]
        print('正在保存图片: {}'.format(file_name))
        with open(os.path.join(img_dir, file_name), "wb") as f:
            f.write(img.content)
        # time.sleep(random.uniform(8, 10))
+
+
if __name__ == '__main__':
    # First gather every image URL (and the album title) up front,
    # then keep retrying the download pass until it completes cleanly.
    all_img_list, title = get_pages()

    finished = False
    while not finished:
        try:
            get_imgs(all_img_list, title)
        except Exception as e:
            # Any failure (network error, bad status) backs off for
            # 30-40 seconds and restarts; get_imgs resumes where it left off.
            print(e)
            time.sleep(random.uniform(30, 40))
        else:
            print("程序执行完成,退出循环")
            finished = True

print("done")