|
@@ -14,8 +14,10 @@ import httpx
|
|
|
from playwright.sync_api import sync_playwright
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
|
|
|
|
target = 'kaizty'
|
|
target = 'kaizty'
|
|
|
-step = 1 # 1 = 获取img链接, 2 = 下载图片, 3 = 1 + 2, 4 = 调试
|
|
|
|
|
-local_proxy = 1
|
|
|
|
|
|
|
+
|
|
|
|
|
+# 1 = 获取img链接, 2 = 下载图片, 3 = 1 + 2, 4 = 调试
|
|
|
|
|
+step = 2
|
|
|
|
|
+local_proxy = 0
|
|
|
title_selector = '#pack-view__inner > section.pack-view__header > h1' # 获取标题选择器
|
|
title_selector = '#pack-view__inner > section.pack-view__header > h1' # 获取标题选择器
|
|
|
img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img' # 获取图片的url
|
|
img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img' # 获取图片的url
|
|
|
img_count_selector = '#pack-view__inner > section.pack-view__header > p' # 获取图片总数选择器
|
|
img_count_selector = '#pack-view__inner > section.pack-view__header > p' # 获取图片总数选择器
|
|
@@ -53,17 +55,23 @@ def open_browser(target_urls):
|
|
|
|
|
|
|
|
img_sequence_num = 1
|
|
img_sequence_num = 1
|
|
|
|
|
|
|
|
- for page_count in range(30, 31):
|
|
|
|
|
|
|
+ for page_count in range(1, 999):
|
|
|
try:
|
|
try:
|
|
|
goto_url = target_url + pages.format(page_count)
|
|
goto_url = target_url + pages.format(page_count)
|
|
|
- page.goto(goto_url, timeout=5000)
|
|
|
|
|
- page.wait_for_load_state('domcontentloaded')
|
|
|
|
|
|
|
+ page.goto(goto_url, timeout=20000)
|
|
|
|
|
+ page.wait_for_selector('body > div.housing > div.housing-coveringap > div.thrcol.refill.afsite > div.thr-ot.hid > div > div.c-content > div:nth-child(3) > div')
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
print(e)
|
|
print(e)
|
|
|
print(f'页面加载失败:url:{goto_url}')
|
|
print(f'页面加载失败:url:{goto_url}')
|
|
|
|
|
|
|
|
page_source = page.content()
|
|
page_source = page.content()
|
|
|
|
|
|
|
|
|
|
+ if "EMPTY" in page_source:
|
|
|
|
|
+ print('没有下一页了, 跳出循环')
|
|
|
|
|
+ break
|
|
|
|
|
+
|
|
|
|
|
+ print(f'开始获取第 {page_count} 页')
|
|
|
|
|
+
|
|
|
title = page.title()
|
|
title = page.title()
|
|
|
|
|
|
|
|
img_list = re.findall('<meta property="og:image" content="(.*?)"', page_source)
|
|
img_list = re.findall('<meta property="og:image" content="(.*?)"', page_source)
|
|
@@ -129,7 +137,22 @@ def download_img(load_data, target_file_path):
|
|
|
while retry:
|
|
while retry:
|
|
|
try:
|
|
try:
|
|
|
resp = httpx.get(img_url, headers={
|
|
resp = httpx.get(img_url, headers={
|
|
|
- "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
|
|
|
|
|
|
|
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
|
|
|
|
+ "Accept-Encoding": "gzip, deflate, br, zstd",
|
|
|
|
|
+ "Accept-Language": "zh-CN,zh;q=0.9",
|
|
|
|
|
+ "Cache-Control": "max-age=0",
|
|
|
|
|
+ "Cookie": "asgfp2=77542c163334cb6fe4f6c38c671acfdd; _ga=GA1.1.1971075315.1723678888; _ga_WF05TQ75CR=GS1.1.1726202265.4.1.1726202301.24.0.0; asgfp2=77542c163334cb6fe4f6c38c671acfdd; sp-chjeuHenj=Po",
|
|
|
|
|
+ "Priority": "u=0, i",
|
|
|
|
|
+ "Referer": "https://www.kaizty.com/photos/bFh6Njdrc01HM0FxeEhrVFVXM2xlUT09.html?page=9",
|
|
|
|
|
+ "Sec-CH-UA": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
|
|
|
|
|
+ "Sec-CH-UA-Mobile": "?0",
|
|
|
|
|
+ "Sec-CH-UA-Platform": "\"Windows\"",
|
|
|
|
|
+ "Sec-Fetch-Dest": "document",
|
|
|
|
|
+ "Sec-Fetch-Mode": "navigate",
|
|
|
|
|
+ "Sec-Fetch-Site": "same-origin",
|
|
|
|
|
+ "Sec-Fetch-User": "?1",
|
|
|
|
|
+ "Upgrade-Insecure-Requests": "1",
|
|
|
|
|
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
|
|
|
})
|
|
})
|
|
|
with open(img_file_path, 'wb') as f:
|
|
with open(img_file_path, 'wb') as f:
|
|
|
f.write(resp.content)
|
|
f.write(resp.content)
|
|
@@ -154,13 +177,13 @@ def save_data(data_item):
|
|
|
for k, v in data_item.items():
|
|
for k, v in data_item.items():
|
|
|
for data in v:
|
|
for data in v:
|
|
|
# 检查img_url是否重复
|
|
# 检查img_url是否重复
|
|
|
- cursor.execute("SELECT img_url FROM flaticon WHERE img_url = %s", (data['url'],))
|
|
|
|
|
|
|
+ cursor.execute(f"SELECT img_url FROM {target} WHERE img_url = %s", (data['url'],))
|
|
|
if cursor.fetchone() is None:
|
|
if cursor.fetchone() is None:
|
|
|
# 插入数据
|
|
# 插入数据
|
|
|
- cursor.execute("""
|
|
|
|
|
- INSERT INTO flaticon (name, target_site, file_title, set_name, serial, download_state, image_suffix, img_url)
|
|
|
|
|
|
|
+ cursor.execute(("""
|
|
|
|
|
+ INSERT INTO {target} (name, target_site, file_title, set_name, serial, download_state, image_suffix, img_url)
|
|
|
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
|
|
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
|
|
|
- """, (
|
|
|
|
|
|
|
+ """).format(target=target), (
|
|
|
None,
|
|
None,
|
|
|
target,
|
|
target,
|
|
|
data['file_title'],
|
|
data['file_title'],
|
|
@@ -252,7 +275,7 @@ def check_psql():
|
|
|
serial INT,
|
|
serial INT,
|
|
|
download_state BOOLEAN,
|
|
download_state BOOLEAN,
|
|
|
image_suffix VARCHAR(50),
|
|
image_suffix VARCHAR(50),
|
|
|
- img_url VARCHAR(255)
|
|
|
|
|
|
|
+ img_url TEXT
|
|
|
);
|
|
);
|
|
|
""")
|
|
""")
|
|
|
print(f"表 '{target}' 创建成功。")
|
|
print(f"表 '{target}' 创建成功。")
|