|
@@ -14,7 +14,7 @@ import httpx
|
|
|
from playwright.sync_api import sync_playwright
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
|
|
|
|
target = 'kaizty'
|
|
target = 'kaizty'
|
|
|
-step = 4 # 1 = 获取img链接, 2 = 下载图片, 3 = 1 + 2, 4 = 调试
|
|
|
|
|
|
|
+step = 1 # 1 = 获取img链接, 2 = 下载图片, 3 = 1 + 2, 4 = 调试
|
|
|
local_proxy = 1
|
|
local_proxy = 1
|
|
|
title_selector = '#pack-view__inner > section.pack-view__header > h1' # 获取标题选择器
|
|
title_selector = '#pack-view__inner > section.pack-view__header > h1' # 获取标题选择器
|
|
|
img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img' # 获取图片的url
|
|
img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img' # 获取图片的url
|
|
@@ -53,26 +53,19 @@ def open_browser(target_urls):
|
|
|
|
|
|
|
|
img_sequence_num = 1
|
|
img_sequence_num = 1
|
|
|
|
|
|
|
|
- for page_count in range(1, 2):
|
|
|
|
|
- # 检查一下当前页面是不是 404
|
|
|
|
|
- try:
|
|
|
|
|
- page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
|
|
|
|
|
- print(f'总页数是 {page_count - 1} 在 url: {goto_url}')
|
|
|
|
|
- break
|
|
|
|
|
- except:
|
|
|
|
|
- pass
|
|
|
|
|
-
|
|
|
|
|
|
|
+ for page_count in range(30, 31):
|
|
|
try:
|
|
try:
|
|
|
goto_url = target_url + pages.format(page_count)
|
|
goto_url = target_url + pages.format(page_count)
|
|
|
page.goto(goto_url, timeout=5000)
|
|
page.goto(goto_url, timeout=5000)
|
|
|
|
|
+ page.wait_for_load_state('domcontentloaded')
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
print(e)
|
|
print(e)
|
|
|
print(f'页面加载失败:url:{goto_url}')
|
|
print(f'页面加载失败:url:{goto_url}')
|
|
|
|
|
|
|
|
- page.wait_for_load_state('domcontentloaded')
|
|
|
|
|
|
|
+ page_source = page.content()
|
|
|
|
|
|
|
|
title = page.title()
|
|
title = page.title()
|
|
|
- page_source = page.content()
|
|
|
|
|
|
|
+
|
|
|
img_list = re.findall('<meta property="og:image" content="(.*?)"', page_source)
|
|
img_list = re.findall('<meta property="og:image" content="(.*?)"', page_source)
|
|
|
|
|
|
|
|
title = clean_string(title)
|
|
title = clean_string(title)
|
|
@@ -89,11 +82,12 @@ def open_browser(target_urls):
|
|
|
})
|
|
})
|
|
|
img_sequence_num += 1
|
|
img_sequence_num += 1
|
|
|
|
|
|
|
|
- all_data[title] = urls
|
|
|
|
|
-
|
|
|
|
|
page.close()
|
|
page.close()
|
|
|
browser.close()
|
|
browser.close()
|
|
|
|
|
|
|
|
|
|
+ if urls:
|
|
|
|
|
+ all_data[title] = urls
|
|
|
|
|
+
|
|
|
# 获取所有 url 数据之后, 存数据库
|
|
# 获取所有 url 数据之后, 存数据库
|
|
|
return all_data
|
|
return all_data
|
|
|
|
|
|
|
@@ -336,6 +330,6 @@ if __name__ == "__main__":
|
|
|
print('下载完成, 程序退出')
|
|
print('下载完成, 程序退出')
|
|
|
elif step == 4:
|
|
elif step == 4:
|
|
|
# 调试
|
|
# 调试
|
|
|
- all_data = open_browser(targets)
|
|
|
|
|
|
|
+ pass
|
|
|
else:
|
|
else:
|
|
|
pass
|
|
pass
|