# -*- coding: utf-8 -*-
"""Download the photos of a paginated kaizty.com gallery with Playwright.

The script walks the gallery pages built from ``base_url``, collects the
photo URLs found in each page's HTML and stores every photo in a local
folder.
"""
import os
import re
import time
from urllib.parse import urlsplit

try:
    # Imported by the original script but not used by any code in this file;
    # kept optional so a missing install does not stop the scraper.
    import httpx  # noqa: F401
except ImportError:
    httpx = None

url_photos = '/photos/'
base_url = 'https://www.kaizty.com//photos/L2lBQ200aE0vOVNmUGcydzhhT296Zz09.html?page={}'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# NOTE(review): the pattern originally passed to re.findall was lost -- the
# source text is truncated between "re.findall('" and the next-page selector.
# This generic <img src="..."> matcher is a placeholder reconstruction and
# must be confirmed against the real page markup.
_PHOTO_URL_RE = re.compile(r'<img[^>]+src="([^"]+)"')

# Only the tail of the original selector survived; whatever ancestor prefix
# preceded it is unknown, so this matches the visible part only.
_NEXT_PAGE_SELECTOR = 'div.page-navigation > a.next'


def clean_string(string):
    """Turn a page title into a folder-friendly name.

    Removes the ``'Kaizty Photos: '`` marker, keeps only the text before the
    first ``|``, drops every character that is not a CJK ideograph
    (U+4E00-U+9FFF), an ASCII letter, a digit or a space, replaces spaces
    with underscores and strips one trailing underscore.
    """
    string = string.replace('Kaizty Photos: ', '')
    string = string.split('|')[0]
    string = re.sub(r'[^\u4e00-\u9fff a-zA-Z0-9]', '', string)
    string = string.replace(' ', '_')
    if string.endswith('_'):
        string = string[:-1]
    return string


def _unique_path(folder, stem, ext):
    """Return a path in *folder* for ``stem + ext`` that does not exist yet."""
    candidate = os.path.join(folder, stem + ext)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(folder, f'{stem}_{counter}{ext}')
        counter += 1
    return candidate


def _find_photo_urls(page_source):
    """Return every photo URL matched by ``_PHOTO_URL_RE`` in *page_source*."""
    return _PHOTO_URL_RE.findall(page_source)


def each_page(page, photo_url, folder, delay=2):
    """Download one photo into *folder* and pause afterwards.

    Args:
        page: Playwright page used to fetch the photo. It is navigated to
            *photo_url*, so it no longer shows the gallery page afterwards.
        photo_url: URL of the photo.
        folder: Existing directory that receives the file.
        delay: Seconds to sleep after the request (default 2, as before).

    The file is named after the current Unix timestamp in whole seconds plus
    the extension of the URL path. A counter is appended if that name is
    already taken, which can happen when *delay* is shorter than a second.
    """
    # Use only the URL path so a query string ("pic.jpg?w=100") or a
    # dot-less URL cannot leak '?' or '/' into the file name.
    ext = os.path.splitext(urlsplit(photo_url).path)[1]
    stem = str(int(time.time()))

    response = page.goto(photo_url)
    if response is None:
        # Page.goto may return None; there is no body to save in that case.
        print(f'no response for {photo_url}')
    else:
        with open(_unique_path(folder, stem, ext), 'wb') as f:
            f.write(response.body())
    time.sleep(delay)


def run(playwright):
    """Scrape gallery pages 1 through 19 and download their photos.

    Stops early when a page has no "next" link. The browser and its context
    are closed even if a page load or a download raises.
    """
    browser = playwright.webkit.launch(headless=True)
    try:
        context = browser.new_context()
        try:
            page = context.new_page()
            # The original carries a commented-out alternative that derives
            # the folder from clean_string(page.title()); the hard-coded
            # name is kept as the active behaviour.
            folder = 'aaa'
            for page_num in range(1, 20):
                page.goto(base_url.format(page_num))
                if not os.path.exists(folder):
                    print(f'new folder {folder}')
                    os.makedirs(folder, exist_ok=True)

                photos_list = _find_photo_urls(page.content())
                # each_page() navigates this same page away from the gallery,
                # so the next-link check has to happen before downloading.
                has_next = page.query_selector(_NEXT_PAGE_SELECTOR) is not None

                for photo_url in photos_list:
                    each_page(page, photo_url, folder)

                if not has_next:
                    print('no next page')
                    break
        finally:
            context.close()
    finally:
        browser.close()


def main():
    """Start Playwright and run the scraper."""
    # Imported here so the helpers above stay importable (and testable)
    # on machines without Playwright installed.
    from playwright.sync_api import sync_playwright

    with sync_playwright() as playwright:
        run(playwright)


if __name__ == '__main__':
    main()