# kaizty.py
  1. # -*- coding: utf-8 -*-
  2. # 共两个步骤, 1, 将目标图片的信息拉到数据库(标题, 所有img的url), 2, 从数据库中读取对应目标站点的所有未下载过的img的url, 下载到本地
  3. # 需要安装psql, 并且 CREATE DATABASE collect; 运行会自动建表
  4. import sys
  5. import os
  6. import time
  7. import random
  8. import re
  9. import psycopg2
  10. sys.path.append(os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'))
  11. import httpx
  12. from playwright.sync_api import sync_playwright
  13. target = 'kaizty'
  14. # 1 = 获取img链接, 2 = 下载图片, 3 = 1 + 2, 4 = 调试
  15. step = 2
  16. local_proxy = 0
  17. title_selector = '#pack-view__inner > section.pack-view__header > h1' # 获取标题选择器
  18. img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img' # 获取图片的url
  19. img_count_selector = '#pack-view__inner > section.pack-view__header > p' # 获取图片总数选择器
  20. not_find_page_selector = 'body > div.page-navigation > a.next' # 当无法获取下一页时, 此选择器为最后一页
  21. project_root = os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection')
  22. psql_params = {
  23. "host": "home.erhe.link",
  24. "port": 55434,
  25. "user": "psql",
  26. "password": "psql",
  27. "dbname": "collect"
  28. }
  29. def open_browser(target_urls):
  30. all_data = {}
  31. for target_url in target_urls:
  32. pages = '?page={}'
  33. urls = []
  34. title = '' # 存放当前页面的title
  35. with sync_playwright() as playwright:
  36. if local_proxy:
  37. browser = playwright.chromium.launch(
  38. headless=True,
  39. proxy={"server": "http://127.0.0.1:7890"}
  40. )
  41. else:
  42. browser = playwright.chromium.launch(headless=True)
  43. context = browser.new_context(viewport={'width': 1280, 'height': 700})
  44. page = context.new_page()
  45. img_sequence_num = 1
  46. for page_count in range(1, 999):
  47. try:
  48. goto_url = target_url + pages.format(page_count)
  49. page.goto(goto_url, timeout=20000)
  50. page.wait_for_selector('body > div.housing > div.housing-coveringap > div.thrcol.refill.afsite > div.thr-ot.hid > div > div.c-content > div:nth-child(3) > div')
  51. except Exception as e:
  52. print(e)
  53. print(f'页面加载失败:url:{goto_url}')
  54. page_source = page.content()
  55. if "EMPTY" in page_source:
  56. print('没有下一页了, 跳出循环')
  57. break
  58. print(f'开始获取第 {page_count} 页')
  59. title = page.title()
  60. img_list = re.findall('<meta property="og:image" content="(.*?)"', page_source)
  61. title = clean_string(title)
  62. for img_url in img_list:
  63. suffix = img_url.split('.')[-1]
  64. sequence = str(img_sequence_num).zfill(3)
  65. urls.append({
  66. 'url': img_url,
  67. 'file_title': title,
  68. 'serial': sequence,
  69. 'img': f'{title}_{sequence}',
  70. 'suffix': suffix
  71. })
  72. img_sequence_num += 1
  73. page.close()
  74. browser.close()
  75. if urls:
  76. all_data[title] = urls
  77. # 获取所有 url 数据之后, 存数据库
  78. return all_data
  79. def download_img(load_data, target_file_path):
  80. # 连接数据库, 准备反写下载状态
  81. conn = psycopg2.connect(**psql_params)
  82. cursor = conn.cursor()
  83. print('正在下载图片')
  84. for data in load_data:
  85. # 如果img文件存在, 即已经下载过, 直接跳过
  86. id = data['id']
  87. name = data['name']
  88. target_site = data['target_site'],
  89. file_title = data['file_title'].replace(' ', '_')
  90. set_name = data['set_name']
  91. serial = str(data['serial']).zfill(3)
  92. image_suffix = data['image_suffix']
  93. img_url = data['img_url']
  94. # 查看每个合集的文件夹是否存在, 不存在就创建
  95. title_file_path = os.path.join(target_file_path, file_title)
  96. if not os.path.exists(title_file_path):
  97. os.mkdir(title_file_path)
  98. img_name = f'{file_title}_{serial}.{image_suffix}' # 图片文件名
  99. img_file_path = os.path.join(str(title_file_path), img_name) # 图片完整路径
  100. if os.path.exists(img_file_path):
  101. # 当此 img 已存在本地时, 在 psql 将数据库状态改为已下载
  102. query = f"UPDATE {target} SET download_state = %s WHERE id = %s"
  103. cursor.execute(query, (True, id))
  104. conn.commit()
  105. print(f'图片 {img_file_path} 已存在。继续!')
  106. continue
  107. retry = 8
  108. while retry:
  109. try:
  110. resp = httpx.get(img_url, headers={
  111. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  112. "Accept-Encoding": "gzip, deflate, br, zstd",
  113. "Accept-Language": "zh-CN,zh;q=0.9",
  114. "Cache-Control": "max-age=0",
  115. "Cookie": "asgfp2=77542c163334cb6fe4f6c38c671acfdd; _ga=GA1.1.1971075315.1723678888; _ga_WF05TQ75CR=GS1.1.1726202265.4.1.1726202301.24.0.0; asgfp2=77542c163334cb6fe4f6c38c671acfdd; sp-chjeuHenj=Po",
  116. "Priority": "u=0, i",
  117. "Referer": "https://www.kaizty.com/photos/bFh6Njdrc01HM0FxeEhrVFVXM2xlUT09.html?page=9",
  118. "Sec-CH-UA": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
  119. "Sec-CH-UA-Mobile": "?0",
  120. "Sec-CH-UA-Platform": "\"Windows\"",
  121. "Sec-Fetch-Dest": "document",
  122. "Sec-Fetch-Mode": "navigate",
  123. "Sec-Fetch-Site": "same-origin",
  124. "Sec-Fetch-User": "?1",
  125. "Upgrade-Insecure-Requests": "1",
  126. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
  127. })
  128. with open(img_file_path, 'wb') as f:
  129. f.write(resp.content)
  130. # 下载成功后, 在 psql 将数据库状态改为已下载
  131. query = f"UPDATE {target} SET download_state = %s WHERE id = %s"
  132. cursor.execute(query, (True, id))
  133. conn.commit()
  134. print(f'已下载:{img_name}')
  135. time.sleep(random.uniform(1, 2))
  136. break
  137. except Exception as e:
  138. print(f'下载图片失败:{img_name}。错误:{e} 重试: {retry}')
  139. retry -= 1
  140. time.sleep(random.uniform(3, 5))
  141. def save_data(data_item):
  142. conn = psycopg2.connect(**psql_params)
  143. cursor = conn.cursor()
  144. for k, v in data_item.items():
  145. for data in v:
  146. # 检查img_url是否重复
  147. cursor.execute(f"SELECT img_url FROM {target} WHERE img_url = %s", (data['url'],))
  148. if cursor.fetchone() is None:
  149. # 插入数据
  150. cursor.execute(("""
  151. INSERT INTO {target} (name, target_site, file_title, set_name, serial, download_state, image_suffix, img_url)
  152. VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
  153. """).format(target=target), (
  154. None,
  155. target,
  156. data['file_title'],
  157. None,
  158. data['serial'],
  159. False,
  160. data['suffix'],
  161. data['url']
  162. ))
  163. conn.commit()
  164. print(f"数据 {data['url']} 插入成功")
  165. else:
  166. print(f"数据 {data['url']} 已存在,未插入")
  167. # 关闭数据库连接
  168. cursor.close()
  169. conn.close()
  170. def load_data():
  171. # 连接数据库
  172. conn = psycopg2.connect(**psql_params)
  173. cursor = conn.cursor()
  174. # 查询download_state为false的所有数据
  175. query = f"SELECT * FROM {target} WHERE download_state = %s order by id asc"
  176. load_data_list = []
  177. try:
  178. # 执行查询
  179. cursor.execute(query, (False,))
  180. # 获取查询结果
  181. rows = cursor.fetchall()
  182. # 打印结果
  183. for row in rows:
  184. load_data_list.append(
  185. {
  186. 'id': row[0],
  187. 'name': row[1],
  188. 'target_site': row[2],
  189. 'file_title': row[3],
  190. 'set_name': row[4],
  191. 'serial': row[5],
  192. 'download_state': row[6],
  193. 'image_suffix': row[7],
  194. 'img_url': row[8]
  195. }
  196. )
  197. except psycopg2.Error as e:
  198. print(f"Database error: {e}")
  199. finally:
  200. # 关闭数据库连接
  201. cursor.close()
  202. conn.close()
  203. if load_data_list:
  204. return load_data_list
  205. else:
  206. print("没有需要下载的数据。")
  207. exit(0)
  208. def check_psql():
  209. # 连接数据库
  210. try:
  211. conn = psycopg2.connect(**psql_params)
  212. except Exception as e:
  213. print(f"无法连接到数据库:{e}")
  214. exit(1)
  215. # 创建cursor对象
  216. cur = conn.cursor()
  217. cur.execute("SELECT EXISTS(SELECT FROM pg_catalog.pg_tables WHERE schemaname = 'public' AND tablename = %s)",
  218. (target,))
  219. exist = cur.fetchone()[0]
  220. if not exist:
  221. # 如果不存在,则创建表
  222. cur.execute(f"""
  223. CREATE TABLE {target} (
  224. id SERIAL PRIMARY KEY,
  225. name VARCHAR(255),
  226. target_site VARCHAR(255),
  227. file_title VARCHAR(255),
  228. set_name VARCHAR(255),
  229. serial INT,
  230. download_state BOOLEAN,
  231. image_suffix VARCHAR(50),
  232. img_url TEXT
  233. );
  234. """)
  235. print(f"表 '{target}' 创建成功。")
  236. # 提交事务
  237. conn.commit()
  238. # 关闭cursor和连接
  239. cur.close()
  240. conn.close()
  241. def check_local_downloads_dir():
  242. # 查看一下是否存在 downloads 文件夹, 不存在就创建一个
  243. download_file_path = os.path.join(str(project_root), 'downloads')
  244. if not os.path.exists(download_file_path):
  245. os.mkdir(download_file_path)
  246. target_file_path = os.path.join(download_file_path, target)
  247. if not os.path.exists(target_file_path):
  248. os.mkdir(target_file_path)
  249. return target_file_path
  250. def clean_string(string):
  251. string = string.replace('Kaizty Photos: ', '')
  252. string = string.split('|')[0]
  253. string = re.sub(r'[^\u4e00-\u9fff a-zA-Z0-9]', '', string)
  254. string = string.replace(' ', '_')
  255. if string.endswith('_'):
  256. string = string[:-1]
  257. return string
  258. if __name__ == "__main__":
  259. # 检查数据库
  260. check_psql()
  261. txt_file_name = 'target_link.txt'
  262. if not os.path.exists(txt_file_name):
  263. with open(txt_file_name, 'w') as file:
  264. file.write('')
  265. print('需要在 target_link.txt 中填写目标链接')
  266. exit(0)
  267. else:
  268. with open('target_link.txt', 'r') as f:
  269. targets = [target.strip() for target in f.readlines()]
  270. if not targets:
  271. print('在 target_link.txt 中未找到目标链接')
  272. exit(0)
  273. print(f'目标链接是:{targets}')
  274. if step == 1:
  275. all_data = open_browser(targets)
  276. save_data(all_data)
  277. elif step == 2:
  278. # 开始读取数据
  279. load_data = load_data()
  280. # 开始下载 img
  281. target_file_path = check_local_downloads_dir()
  282. download_img(load_data, target_file_path)
  283. print('下载完成, 程序退出')
  284. elif step == 3:
  285. # 保存 img 链接
  286. all_data = open_browser(targets)
  287. save_data(all_data)
  288. # 开始读取数据
  289. load_data = load_data()
  290. # 开始下载 img
  291. target_file_path = check_local_downloads_dir()
  292. download_img(load_data, target_file_path)
  293. print('下载完成, 程序退出')
  294. elif step == 4:
  295. # 调试
  296. pass
  297. else:
  298. pass