jack 1 jaar geleden
bovenliggende
commit
2f82916079
2 gewijzigde bestanden met toevoegingen van 108 en 59 verwijderingen
  1. 3 59
      .gitignore
  2. 105 0
      kaizty_spider.py

+ 3 - 59
.gitignore

@@ -1,60 +1,4 @@
-# ---> Python
-# Byte-compiled / optimized / DLL files
+.DS_Store
 __pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-env/
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-*.egg-info/
-.installed.cfg
-*.egg
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*,cover
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
+*.pyc
+.idea

+ 105 - 0
kaizty_spider.py

@@ -0,0 +1,105 @@
+import os.path
+import re
+import random
+import time
+
+import httpx
+
# Album-specific page id (opaque token) including the query '?' separator.
url_key = 'UHh0dkRPOWwyV2R2V0ZFU3hMRFZaZz09.html?'
# Path segment for photo albums on the site.
url_photos = '/photos/'
base_url = 'https://www.kaizty.com/'
# Pagination query-string template, e.g. 'page=3'.
url_page = 'page={}'

# Desktop-browser User-Agent so the site serves the normal HTML markup.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
}
+
+
def get_pages():
    """Walk the paginated album and collect every image URL.

    Fetches successive pages (the site exposes no explicit page count)
    and scrapes the ``<meta itemprop="image">`` URLs from each one,
    stopping after ``max_error_times`` failed extractions.

    Returns:
        tuple[list[str], str]: all image URLs in page order, and a
        filesystem-safe album title ('' if it was never extracted).
    """
    title = ''
    all_img_list = []

    error_times = 0
    max_error_times = 2  # give up after this many failed pages
    page = 1

    while error_times < max_error_times:
        print('正在获取第 {} 页数据'.format(page))
        url = base_url + url_photos + url_key + url_page.format(page)

        response = httpx.get(url, headers=headers)
        response.encoding = 'utf-8'
        html = response.text

        # Raw string: the original pattern's '\!' / '\[' are invalid
        # string escapes that only worked by accident.
        target_block = re.findall(r'<!\[endif\]--><title>(.*?)<meta property="og:locale"', html)
        if not target_block:
            # BUG FIX: the original `continue`d here without bumping
            # error_times, so once the site stopped returning the expected
            # markup (e.g. past the last page) the loop ran forever.
            error_times += 1
            page += 1
            continue
        target_block = target_block[0]

        # Extract the album title once, from the first successful page.
        if not title:
            re_title = re.findall(r'(.*?)\| Page', target_block)
            if not re_title:
                print('获取 title 失败')
                error_times += 1
                page += 1
                continue
            # Drop characters that are illegal in Windows file names,
            # then remove spaces, so the title can name a directory.
            title = re.sub(r'[<>:"/\\|?*]', '', re_title[0]).replace(' ', '')

        img_list = re.findall('<meta itemprop="image" content="(.*?)"', target_block)
        if not img_list:
            # page is incremented after this message now, so the number
            # printed matches the page that actually failed.
            print('获取图片链接失败, 第{}页'.format(page))
            error_times += 1
            page += 1
            continue

        all_img_list += img_list
        page += 1
        # time.sleep(random.uniform(2, 3))

    return all_img_list, title
+
+
def get_imgs(all_img_list, title):
    """Download images into a directory named after the album title.

    Supports resuming: if numbered image files already exist in the
    target directory, downloading restarts from the highest existing
    number (re-fetching that one in case it was truncated).

    Args:
        all_img_list: image URLs in page order.
        title: target directory name (created if missing).

    Raises:
        Exception: when an image request returns a non-200 status.
    """
    print('\n\n开始保存图片')

    img_dir = os.path.join(os.getcwd(), title)
    # makedirs + exist_ok avoids the check-then-create race of the
    # original `if not exists: mkdir`.
    os.makedirs(img_dir, exist_ok=True)

    # Collect the numeric prefixes of existing downloads. sorted() is
    # required because os.listdir() order is arbitrary (the original read
    # files[-1] unsorted); the isdigit() filter prevents a crash on stray
    # files such as .DS_Store.
    existing_nums = sorted(
        int(f.split('.')[0])
        for f in os.listdir(img_dir)
        if os.path.isfile(os.path.join(img_dir, f)) and f.split('.')[0].isdigit()
    )

    # BUG FIX: the original defaulted to 1 on a fresh run, silently
    # skipping image index 0. Start at 0 when nothing is saved yet.
    start = existing_nums[-1] if existing_nums else 0

    for n in range(start, len(all_img_list)):
        img = httpx.get(all_img_list[n], headers=headers)

        if img.status_code != 200:
            print('请求图片错误, 程序退出')
            raise Exception(f'状态码 {img.status_code}')

        # Name files by index so the resume logic above can find the
        # highest completed number; keep the URL's file extension.
        file_name = f"{n:04d}" + "." + all_img_list[n].split(".")[-1]
        print('正在保存图片: {}'.format(file_name))
        with open(os.path.join(img_dir, file_name), "wb") as f:
            f.write(img.content)
        # time.sleep(random.uniform(8, 10))
+
+
if __name__ == '__main__':
    # First gather every image URL (and the album title) up front,
    # then keep retrying the download pass until it completes cleanly.
    all_img_list, title = get_pages()

    finished = False
    while not finished:
        try:
            get_imgs(all_img_list, title)
        except Exception as e:
            # Any failure (network error, bad status) backs off for
            # 30-40 seconds and restarts; get_imgs resumes where it left off.
            print(e)
            time.sleep(random.uniform(30, 40))
        else:
            print("程序执行完成,退出循环")
            finished = True

print("done")