jack 1 سال پیش
والد
کامیت
de8a376294
1 فایلهای تغییر یافته به همراه 69 افزوده شده و 0 حذف شده
  1. 69 0
      kaizty_playwerght.py

+ 69 - 0
kaizty_playwerght.py

@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+import os
+import time
+import re
+import httpx
+from playwright.sync_api import sync_playwright
+
+# Not referenced by any other code in this script.
+url_photos = '/photos/'
+# Listing URL template; run() fills ``{}`` with the page number (starting at 1).
+base_url = 'https://www.kaizty.com//photos/L2lBQ200aE0vOVNmUGcydzhhT296Zz09.html?page={}'
+# Browser-like request headers. NOTE(review): nothing in this script passes
+# them to Playwright or httpx - confirm whether they are still needed.
+headers = {
+    'User-Agent': (
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+    ),
+}
+
+
+def clean_string(string):
+    """Turn a page title into a string usable as a folder name.
+
+    Every occurrence of ``'Kaizty Photos: '`` is removed, everything from the
+    first ``'|'`` onwards is dropped, every character other than CJK
+    ideographs (U+4E00-U+9FFF), ASCII letters, ASCII digits and spaces is
+    deleted, and the remaining spaces become underscores.
+
+    Args:
+        string: Raw title text.
+
+    Returns:
+        The sanitised name, with no trailing underscores.
+    """
+    string = string.replace('Kaizty Photos: ', '')
+    string = string.split('|')[0]
+    string = re.sub(r'[^\u4e00-\u9fff a-zA-Z0-9]', '', string)
+    string = string.replace(' ', '_')
+    # Underscores can only come from spaces at this point, so rstrip removes
+    # every trailing one; the previous code dropped at most a single one.
+    return string.rstrip('_')
+
+
+def each_page(page, photo_url, folder):
+    """Download one image by navigating ``page`` to it and save it in ``folder``.
+
+    The file is named ``<unix-seconds><extension>``. NOTE: ``page`` is left on
+    ``photo_url`` afterwards, so callers must not expect it to still show the
+    document it displayed before the call.
+
+    Args:
+        page: Playwright page used to fetch the image.
+        photo_url: URL of the image.
+        folder: Existing directory the file is written into.
+    """
+    # Take the extension from the URL without its query string / fragment;
+    # splitting the whole URL on '.' would otherwise yield names such as
+    # 'jpg?w=800' or, for extension-less URLs, a chunk of the host and path.
+    url_path = photo_url.split('#', 1)[0].split('?', 1)[0]
+    img_suffix = os.path.splitext(url_path)[1] or '.bin'
+    img_name = str(int(time.time())) + img_suffix
+    response = page.goto(photo_url)
+    # goto() may return None, and an error response would otherwise be saved
+    # to disk under an image file name.
+    if response is None or not response.ok:
+        print(f'skip {photo_url}')
+    else:
+        with open(os.path.join(folder, img_name), 'wb') as f:
+            f.write(response.body())
+    # Pause between downloads.
+    time.sleep(2)
+
+
+def _scrape_listing(page, folder):
+    """Visit listing pages 1-19 with ``page`` and download their og:image URLs."""
+    next_selector = 'body > div.page-navigation > a.next'
+    if not os.path.exists(folder):
+        print(f'new folder {folder}')
+        os.makedirs(folder)
+    for page_num in range(1, 20):
+        page.goto(base_url.format(page_num))
+        page_source = page.content()
+        photos_list = re.findall('<meta property="og:image" content="(.*?)"', page_source)
+        # Look for the "next" link now: each_page() navigates this same page
+        # to the image URLs, so checking after the downloads would inspect
+        # the last image document instead of the listing page.
+        has_next = page.query_selector(next_selector) is not None
+        for photo_url in photos_list:
+            each_page(page, photo_url, folder)
+        # Pause briefly before the next listing page.
+        time.sleep(2)
+        # No next page was found, so leave the loop.
+        if not has_next:
+            print('no next page')
+            break
+
+
+def run(playwright):
+    """Download every ``og:image`` found on the listing pages of ``base_url``.
+
+    A headless WebKit browser walks the listing pages, stopping early at the
+    first page without a "next" link. All images go into the ``aaa`` folder.
+    The context and browser are closed even when a step raises.
+
+    Args:
+        playwright: Object yielded by ``sync_playwright()``.
+    """
+    browser = playwright.webkit.launch(headless=True)
+    try:
+        context = browser.new_context()
+        try:
+            _scrape_listing(context.new_page(), 'aaa')
+        finally:
+            context.close()
+    finally:
+        browser.close()
+
+
+# Script entry point: start Playwright and run the scraper. There is no
+# ``__name__`` guard, so this also executes when the module is imported.
+with sync_playwright() as playwright:
+    run(playwright)