@@ -14,8 +14,8 @@ import httpx
 from playwright.sync_api import sync_playwright

 target = 'kaizty'
-step = 1  # 1 = fetch img links, 2 = download images, 3 = 1 + 2
-local_proxy = 0
+step = 4  # 1 = fetch img links, 2 = download images, 3 = 1 + 2, 4 = debug
+local_proxy = 1
 title_selector = '#pack-view__inner > section.pack-view__header > h1'  # selector for the page title
 img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'  # selector for each image's URL
 img_count_selector = '#pack-view__inner > section.pack-view__header > p'  # selector for the total image count
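This hunk flips `local_proxy` on, but the diff doesn't show how the flag is consumed. A minimal sketch of one plausible wiring into Playwright's launch options; the proxy address and the `launch_kwargs` plumbing are assumptions, not code from this repo:

```python
from playwright.sync_api import sync_playwright

local_proxy = 1

# Hypothetical wiring: only pass a proxy to chromium.launch() when the flag is set.
# The address below is a placeholder for whatever local proxy the script relies on.
launch_kwargs = {}
if local_proxy:
    launch_kwargs['proxy'] = {'server': 'http://127.0.0.1:7890'}

with sync_playwright() as playwright:
    browser = playwright.chromium.launch(**launch_kwargs)
    page = browser.new_page()
    page.goto('https://www.kaizty.com')
    print(page.title())
    browser.close()
```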
@@ -36,7 +36,7 @@ def open_browser(target_urls):
     all_data = {}

     for target_url in target_urls:
-        pages = '/{}'
+        pages = '?page={}'
         urls = []
         title = ''  # holds the current page's title
         with sync_playwright() as playwright:
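The pagination template switches from a path segment to a query parameter. Where `pages` gets applied is outside this hunk, but the `goto_url` in the error message further down suggests plain concatenation, roughly as follows (the pack URL and the joining logic are assumptions):

```python
target_url = 'https://www.kaizty.com/packs/example-pack'  # hypothetical pack URL
pages = '?page={}'

for page_count in range(1, 2):
    goto_url = target_url + pages.format(page_count)
    # e.g. https://www.kaizty.com/packs/example-pack?page=1
    print(goto_url)
```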
@@ -52,7 +52,8 @@ def open_browser(target_urls):
             page = context.new_page()

             img_sequence_num = 1
-            for page_count in range(1, 999):
+
+            for page_count in range(1, 2):
                 # check whether the current page is a 404
                 try:
                     page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
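The 404 check works by inverting `wait_for_selector`: if the not-found marker attaches within 2 s the page is treated as missing, while on a normal page the call times out and the exception is swallowed. A self-contained sketch of the same pattern; the selector value is a placeholder, since `not_find_page_selector` is defined elsewhere in the script:

```python
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError

not_find_page_selector = '.error-404'  # placeholder; the real selector lives elsewhere in this file

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()
    page.goto('https://www.kaizty.com/packs/does-not-exist')
    try:
        # If the 404 marker shows up within 2 s, stop paginating.
        page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
        print('404 page detected, stop')
    except PlaywrightTimeoutError:
        # No 404 marker appeared: assume the page is valid and keep scraping.
        pass
    browser.close()
```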
@@ -68,46 +69,31 @@ def open_browser(target_urls):
                     print(e)
                     print(f'Page failed to load: url: {goto_url}')

-                if page_count == 1:
-                    # fetch the title
-                    page.wait_for_selector(title_selector, state="attached", timeout=10000)
-                    title = page.query_selector(title_selector).inner_text()
-
-                    img_count = page.query_selector(img_count_selector).inner_text()
-                    img_count = int(img_count.split(' ')[0])
-
-                    invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' ', 'Icon Pack ']
-                    for char in invalid_chars:
-                        title = title.replace(char, '')
-
-                for i in range(1, img_count + 1):
-                    # select all the <a> tags
-                    elements = page.query_selector_all(img_selector.format(i))
-
-                    # iterate over all the <a> tags and extract the href attribute
-                    for element in elements:
-                        src = element.get_attribute('src')
-                        if src:
-                            src = src.replace('/128/', '/512/')
-                            suffix = src.split('.')[-1]
-                            sequence = str(img_sequence_num).zfill(3)
-                            urls.append({
-                                'url': src,
-                                'file_title': title,
-                                'serial': sequence,
-                                'img': f'{title}_{sequence}',
-                                'suffix': suffix
-                            })
-                            img_sequence_num += 1
-                            break
-
-                print(f'All image URLs fetched. Total images: {len(urls)}')
+                page.wait_for_load_state('domcontentloaded')

-            page.close()
-            browser.close()
+                title = page.title()
+                page_source = page.content()
+                img_list = re.findall('<meta property="og:image" content="(.*?)"', page_source)
+
+                title = clean_string(title)
+
+                for img_url in img_list:
+                    suffix = img_url.split('.')[-1]
+                    sequence = str(img_sequence_num).zfill(3)
+                    urls.append({
+                        'url': img_url,
+                        'file_title': title,
+                        'serial': sequence,
+                        'img': f'{title}_{sequence}',
+                        'suffix': suffix
+                    })
+                    img_sequence_num += 1

             all_data[title] = urls

+            page.close()
+            browser.close()
+
     # after collecting all the url data, save it to the database
     return all_data
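Two things this hunk leans on are outside the diff: `clean_string` and the `re` module (the `re.findall` call assumes `import re` near the top of the file). The deleted inline sanitation above hints at what `clean_string` probably does; a hypothetical reconstruction, not the actual definition:

```python
def clean_string(title: str) -> str:
    """Strip site boilerplate and characters that are invalid in file names.

    Hypothetical reconstruction based on the inline sanitation removed in
    this hunk; the real clean_string elsewhere in the script may differ.
    """
    # 'Icon Pack ' must go first: once spaces are stripped it can no longer match.
    invalid_chars = ['Icon Pack ', '<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' ']
    for char in invalid_chars:
        title = title.replace(char, '')
    return title
```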
@@ -340,5 +326,16 @@ if __name__ == "__main__":
         # save the img links
         all_data = open_browser(targets)
         save_data(all_data)
+
+        # read the data back
+        img_data = load_data()
+
+        # start downloading the images
+        target_file_path = check_local_downloads_dir()
+        download_img(img_data, target_file_path)
+        print('Download complete, exiting')
+    elif step == 4:
+        # debug
+        all_data = open_browser(targets)
     else:
         pass
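`load_data`, `check_local_downloads_dir`, and `download_img` are all defined outside this diff. Given the url-dict shape built in `open_browser` and the `httpx` import at the top of the file, step 2 presumably does something along these lines; a minimal sketch under those assumptions, not the script's actual implementation:

```python
import os

import httpx


def download_img(all_data, target_file_path):
    # all_data maps a pack title to a list of dicts shaped like the ones built
    # in open_browser: {'url', 'file_title', 'serial', 'img', 'suffix'}.
    with httpx.Client(timeout=30) as client:
        for title, urls in all_data.items():
            pack_dir = os.path.join(target_file_path, title)
            os.makedirs(pack_dir, exist_ok=True)
            for item in urls:
                resp = client.get(item['url'])
                resp.raise_for_status()
                file_name = f"{item['img']}.{item['suffix']}"
                with open(os.path.join(pack_dir, file_name), 'wb') as f:
                    f.write(resp.content)
```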