# kaizty.py
# -*- coding: utf-8 -*-
# Two steps: 1) pull the target pages' info (title, every img URL) into the database;
# 2) read all not-yet-downloaded img URLs for the target site from the database and download them locally.
# Requires PostgreSQL; run CREATE DATABASE collect; first. The table is created automatically at runtime.
import sys
import os
import time
import random
import re
import psycopg2
sys.path.append(os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'))
import httpx
from playwright.sync_api import sync_playwright

target = 'kaizty'
step = 1  # 1 = collect img URLs, 2 = download images, 3 = 1 + 2
local_proxy = 0
title_selector = '#pack-view__inner > section.pack-view__header > h1'  # selector for the page title
img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'  # selector for each image's URL
img_count_selector = '#pack-view__inner > section.pack-view__header > p'  # selector for the total image count
not_find_page_selector = 'body > div.page-navigation > a.next'  # when the next page cannot be fetched, this selector marks the last page
project_root = os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection')
psql_params = {
    "host": "home.erhe.link",
    "port": 55434,
    "user": "psql",
    "password": "psql",
    "dbname": "collect"
}
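
# One-time bootstrap before the first run (a sketch; assumes a superuser psql shell
# and that the 'psql' role referenced in psql_params does not exist yet):
#   CREATE ROLE psql LOGIN PASSWORD 'psql';
#   CREATE DATABASE collect OWNER psql;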
def open_browser(target_urls):
    all_data = {}
    for target_url in target_urls:
        pages = '/{}'
        urls = []
        title = ''  # holds the current page's title
        with sync_playwright() as playwright:
            if local_proxy:
                browser = playwright.chromium.launch(
                    headless=True,
                    proxy={"server": "http://127.0.0.1:7890"}
                )
            else:
                browser = playwright.chromium.launch(headless=True)
            context = browser.new_context(viewport={'width': 1280, 'height': 700})
            page = context.new_page()
            img_sequence_num = 1
            for page_count in range(1, 999):
                # check whether the current page is a 404
                try:
                    page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
                    print(f'Total pages: {page_count - 1} at url: {goto_url}')
                    break
                except Exception:
                    pass
                try:
                    goto_url = target_url + pages.format(page_count)
                    page.goto(goto_url, timeout=5000)
                except Exception as e:
                    print(e)
                    print(f'Page failed to load, url: {goto_url}')
                if page_count == 1:
                    # grab the title
                    page.wait_for_selector(title_selector, state="attached", timeout=10000)
                    title = page.query_selector(title_selector).inner_text()
                    img_count = page.query_selector(img_count_selector).inner_text()
                    img_count = int(img_count.split(' ')[0])
                    invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' ', 'Icon Pack ']
                    for char in invalid_chars:
                        title = title.replace(char, '')
                for i in range(1, img_count + 1):
                    # select the <img> elements of the i-th list item
                    elements = page.query_selector_all(img_selector.format(i))
                    # walk the elements and extract the src attribute
                    for element in elements:
                        src = element.get_attribute('src')
                        if src:
                            src = src.replace('/128/', '/512/')
                            suffix = src.split('.')[-1]
                            sequence = str(img_sequence_num).zfill(3)
                            urls.append({
                                'url': src,
                                'file_title': title,
                                'serial': sequence,
                                'img': f'{title}_{sequence}',
                                'suffix': suffix
                            })
                            img_sequence_num += 1
                        break  # only the first match per list item is needed
            print(f'All image URLs collected. Total images: {len(urls)}')
            page.close()
            browser.close()
        all_data[title] = urls
    # all URL data collected; hand it back so it can be stored in the database
    return all_data
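
# download_img: for each pending row, skip files that already exist locally (marking
# them downloaded), otherwise fetch the image over httpx with up to 8 retries and
# flip download_state to TRUE on success.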
def download_img(load_data, target_file_path):
    # connect to the database so download states can be written back
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()
    print('Downloading images')
    for data in load_data:
        # if the img file already exists it was downloaded before, so skip it
        id = data['id']
        name = data['name']
        target_site = data['target_site']
        file_title = data['file_title'].replace(' ', '_')
        set_name = data['set_name']
        serial = str(data['serial']).zfill(3)
        image_suffix = data['image_suffix']
        img_url = data['img_url']
        # make sure each collection has its own folder; create it if missing
        title_file_path = os.path.join(target_file_path, file_title)
        if not os.path.exists(title_file_path):
            os.mkdir(title_file_path)
        img_name = f'{file_title}_{serial}.{image_suffix}'  # image file name
        img_file_path = os.path.join(str(title_file_path), img_name)  # full image path
        if os.path.exists(img_file_path):
            # the img is already on disk, so mark the row downloaded in psql
            query = f"UPDATE {target} SET download_state = %s WHERE id = %s"
            cursor.execute(query, (True, id))
            conn.commit()
            print(f'Image {img_file_path} already exists. Continuing!')
            continue
        retry = 8
        while retry:
            try:
                resp = httpx.get(img_url, headers={
                    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
                })
                with open(img_file_path, 'wb') as f:
                    f.write(resp.content)
                # mark the row downloaded in psql once the file is written
                query = f"UPDATE {target} SET download_state = %s WHERE id = %s"
                cursor.execute(query, (True, id))
                conn.commit()
                print(f'Downloaded: {img_name}')
                time.sleep(random.uniform(1, 2))
                break
            except Exception as e:
                print(f'Image download failed: {img_name}. Error: {e} Retries left: {retry}')
                retry -= 1
                time.sleep(random.uniform(3, 5))
    # close the database connection
    cursor.close()
    conn.close()
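
# save_data: de-duplicate on img_url, then insert one row per collected image into
# the target table with download_state = FALSE so step 2 can pick it up later.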
def save_data(data_item):
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()
    for k, v in data_item.items():
        for data in v:
            # check whether this img_url is already stored
            cursor.execute(f"SELECT img_url FROM {target} WHERE img_url = %s", (data['url'],))
            if cursor.fetchone() is None:
                # insert the row
                cursor.execute(f"""
                    INSERT INTO {target} (name, target_site, file_title, set_name, serial, download_state, image_suffix, img_url)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                """, (
                    None,
                    target,
                    data['file_title'],
                    None,
                    data['serial'],
                    False,
                    data['suffix'],
                    data['url']
                ))
                conn.commit()
                print(f"Row {data['url']} inserted")
            else:
                print(f"Row {data['url']} already exists; not inserted")
    # close the database connection
    cursor.close()
    conn.close()
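
# load_data: return every row whose download_state is FALSE as a list of dicts,
# ordered by id; exits the process when nothing is pending.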
def load_data():
    # connect to the database
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()
    # select every row whose download_state is false
    query = f"SELECT * FROM {target} WHERE download_state = %s ORDER BY id ASC"
    load_data_list = []
    try:
        # run the query
        cursor.execute(query, (False,))
        # fetch the results
        rows = cursor.fetchall()
        # map each row onto a dict keyed by column name
        for row in rows:
            load_data_list.append(
                {
                    'id': row[0],
                    'name': row[1],
                    'target_site': row[2],
                    'file_title': row[3],
                    'set_name': row[4],
                    'serial': row[5],
                    'download_state': row[6],
                    'image_suffix': row[7],
                    'img_url': row[8]
                }
            )
    except psycopg2.Error as e:
        print(f"Database error: {e}")
    finally:
        # close the database connection
        cursor.close()
        conn.close()
    if load_data_list:
        return load_data_list
    else:
        print("Nothing left to download.")
        exit(0)
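
# check_psql: verify the 'collect' database is reachable and create the per-site
# table (named after `target`) if it does not exist yet.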
def check_psql():
    # connect to the database
    try:
        conn = psycopg2.connect(**psql_params)
    except Exception as e:
        print(f"Could not connect to the database: {e}")
        exit(1)
    # create a cursor
    cur = conn.cursor()
    cur.execute("SELECT EXISTS(SELECT FROM pg_catalog.pg_tables WHERE schemaname = 'public' AND tablename = %s)",
                (target,))
    exist = cur.fetchone()[0]
    if not exist:
        # the table does not exist yet, so create it
        cur.execute(f"""
            CREATE TABLE {target} (
                id SERIAL PRIMARY KEY,
                name VARCHAR(255),
                target_site VARCHAR(255),
                file_title VARCHAR(255),
                set_name VARCHAR(255),
                serial INT,
                download_state BOOLEAN,
                image_suffix VARCHAR(50),
                img_url VARCHAR(255)
            );
        """)
        print(f"Table '{target}' created.")
    # commit the transaction
    conn.commit()
    # close the cursor and connection
    cur.close()
    conn.close()
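
# check_local_downloads_dir: ensure <project_root>/downloads/<target> exists and
# return its path.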
def check_local_downloads_dir():
    # make sure the downloads folder exists; create it if missing
    download_file_path = os.path.join(str(project_root), 'downloads')
    if not os.path.exists(download_file_path):
        os.mkdir(download_file_path)
    target_file_path = os.path.join(download_file_path, target)
    if not os.path.exists(target_file_path):
        os.mkdir(target_file_path)
    return target_file_path
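
# clean_string: strip the 'Kaizty Photos: ' prefix and anything after '|', keep only
# CJK characters, ASCII alphanumerics, and spaces, then turn spaces into underscores.
# Note: currently not referenced anywhere in this script.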
def clean_string(string):
    string = string.replace('Kaizty Photos: ', '')
    string = string.split('|')[0]
    string = re.sub(r'[^\u4e00-\u9fff a-zA-Z0-9]', '', string)
    string = string.replace(' ', '_')
    if string.endswith('_'):
        string = string[:-1]
    return string

if __name__ == "__main__":
    # make sure the database and table exist
    check_psql()
    txt_file_name = 'target_link.txt'
    if not os.path.exists(txt_file_name):
        with open(txt_file_name, 'w') as file:
            file.write('')
        print('Fill in the target links in target_link.txt')
        exit(0)
    else:
        with open('target_link.txt', 'r') as f:
            targets = [line.strip() for line in f.readlines() if line.strip()]
        if not targets:
            print('No target links found in target_link.txt')
            exit(0)
        print(f'Target links: {targets}')
        if step == 1:
            all_data = open_browser(targets)
            save_data(all_data)
        elif step == 2:
            # read the pending rows from the database
            load_data = load_data()
            # download the images
            target_file_path = check_local_downloads_dir()
            download_img(load_data, target_file_path)
            print('Download finished, exiting')
        elif step == 3:
            # save the img links, then download them (step 1 followed by step 2)
            all_data = open_browser(targets)
            save_data(all_data)
            load_data = load_data()
            target_file_path = check_local_downloads_dir()
            download_img(load_data, target_file_path)
            print('Download finished, exiting')
        else:
            pass
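
# Example run (the pack URL below is a hypothetical placeholder; put real pack URLs,
# one per line, into target_link.txt):
#   echo 'https://www.kaizty.com/photos/example-pack' >> target_link.txt
#   python kaizty.py   # step = 1 collects the URLs; flip step to 2 to download them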