flaticon.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  1. # -*- coding: utf-8 -*-
  2. # 共两个步骤, 1, 将目标图片的信息拉到数据库(标题, 所有img的url), 2, 从数据库中读取对应目标站点的所有未下载过的img的url, 下载到本地
  3. # 需要安装psql, 并且 CREATE DATABASE collect; 运行会自动建表
  4. import socket
  5. import sys
  6. import os
  7. import time
  8. import random
  9. from concurrent.futures import ThreadPoolExecutor
  10. import psycopg2
  11. sys.path.append(os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'))
  12. import httpx
  13. from playwright.sync_api import sync_playwright
  14. target = 'flaticon'
  15. step = 4 # 1 = 获取img链接, 2 = 下载图片, 3 = 1 + 2, 4 = 调试
  16. local_proxy = 0
  17. thread_count = 8
  18. title_selector = '#pack-view__inner > section.pack-view__header > h1' # 获取标题选择器
  19. img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img' # 获取图片的url
  20. img_count_selector = '#pack-view__inner > section.pack-view__header > p' # 获取图片总数选择器
  21. not_find_page_selector = '#viewport > div.errorpage.e404 > h1' # 当无法获取下一页时, 此选择器为最后一页
  22. project_root = os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection')
  23. # 获取局域网ip, 如果不是局域网, 则用公网连接数据库
  24. s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
  25. s.connect(('10.255.255.255', 1))
  26. IP = s.getsockname()[0]
  27. s.close()
  28. if '192.168.100' not in IP:
  29. psql_params = {
  30. "host": "home.erhe.link",
  31. "port": 55434,
  32. "user": "psql",
  33. "password": "psql",
  34. "dbname": "collect"
  35. }
  36. else:
  37. psql_params = {
  38. "host": "192.168.100.146",
  39. "port": 5434,
  40. "user": "psql",
  41. "password": "psql",
  42. "dbname": "collect"
  43. }
def open_browser(target_urls):
    """Scrape each pack URL: collect the pack title and every icon's 512px image
    URL, then persist the records for that pack via save_data().

    target_urls: list of pack URLs; the page number is appended as '/<n>'.
    """
    # all_data = {}  # (disabled) would batch every pack before a single bulk save
    link_count = 1
    for target_url in target_urls:
        print(f'\n开始获取 {target_url} 数据, 当前链接是第 {link_count} 个, 共 {len(target_urls)} 个链接')
        link_count += 1
        pages = '/{}'  # pagination suffix template
        urls = []  # image records collected for the current pack
        title = ''  # title of the current pack page
        total_page_count = 0  # total number of pages in the pack
        with sync_playwright() as playwright:
            if local_proxy:
                browser = playwright.chromium.launch(
                    headless=True,
                    proxy={"server": "http://127.0.0.1:7890"}
                )
            else:
                browser = playwright.chromium.launch(headless=True)
            context = browser.new_context(viewport={'width': 1280, 'height': 700})
            page = context.new_page()
            img_sequence_num = 1  # running serial used to name images across pages
            for page_count in range(1, 999):
                try:
                    goto_url = target_url + pages.format(page_count)
                    page.goto(goto_url, timeout=8000)
                except Exception as e:
                    # Navigation timeouts are tolerated; the selectors below decide what to do.
                    pass
                    # print(e)
                    # print(f'页面加载失败:url:{goto_url}')
                if page_count == 1:
                    # First page: grab the pack title and the total icon count.
                    page.wait_for_selector(title_selector, state="attached", timeout=10000)
                    title = page.query_selector(title_selector).inner_text()
                    img_count = page.query_selector(img_count_selector).inner_text()
                    # The count element starts with the number — take the leading token.
                    img_count = int(img_count.split(' ')[0])
                    # Strip characters that are illegal or awkward in file names.
                    invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' ', 'Icon Pack ']
                    for char in invalid_chars:
                        title = title.replace(char, '')
                else:
                    try:
                        # If the 404 element appears, the previous page was the last one.
                        page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
                        total_page_count = page_count - 1
                        break
                    except:
                        pass  # not a 404 — keep scraping this page
                for i in range(1, img_count + 1):
                    # Select the <img> of the i-th grid cell (may be absent on this page).
                    elements = page.query_selector_all(img_selector.format(i))
                    # Extract the src attribute from every match.
                    for element in elements:
                        src = element.get_attribute('src')
                        if src:
                            # Swap the 128px thumbnail path for the 512px version.
                            src = src.replace('/128/', '/512/')
                            suffix = src.split('.')[-1]
                            sequence = str(img_sequence_num).zfill(3)
                            urls.append({
                                'url': src,
                                'file_title': title,
                                'serial': sequence,
                                'img': f'{title}_{sequence}',
                                'suffix': suffix
                            })
                            img_sequence_num += 1
                # NOTE(review): this unconditional break stops after the first page,
                # so the 404 check above never fires and total_page_count stays 0.
                # Looks like a debugging leftover — confirm before relying on
                # multi-page packs.
                break
            print(f'所有图片URL已获取。总页数: 共 {total_page_count} 页, 总共图片 {len(urls)}, 正在写入数据库...')
            page.close()
            browser.close()
        # all_data[title] = urls  # (disabled) save everything only after all packs are scraped
        save_data({title: urls})
        print(f'{title} 已保存')
    # After all URL data is gathered it is already stored in the database.
    # return all_data
  117. def download_img(load_data, target_file_path):
  118. print('正在下载图片')
  119. with ThreadPoolExecutor(max_workers=thread_count) as executor:
  120. executor.map(single_img_download,
  121. [(index, data, load_data, target_file_path) for index, data in enumerate(load_data)])
  122. def single_img_download(args):
  123. index, data, load_data, target_file_path = args
  124. # 连接数据库, 准备反写下载状态
  125. conn = psycopg2.connect(**psql_params)
  126. cursor = conn.cursor()
  127. # 如果img文件存在, 即已经下载过, 直接跳过
  128. id = data['id']
  129. name = data['name']
  130. target_site = data['target_site'],
  131. file_title = data['file_title'].replace(' ', '_')
  132. set_name = data['set_name']
  133. serial = str(data['serial']).zfill(3)
  134. image_suffix = data['image_suffix']
  135. img_url = data['img_url']
  136. # 查看每个合集的文件夹是否存在, 不存在就创建
  137. title_file_path = os.path.join(target_file_path, file_title)
  138. if not os.path.exists(title_file_path):
  139. os.mkdir(title_file_path)
  140. img_name = f'{file_title}_{serial}.{image_suffix}' # 图片文件名
  141. img_file_path = os.path.join(str(title_file_path), img_name) # 图片完整路径
  142. if os.path.exists(img_file_path):
  143. # 当此 img 已存在本地时, 在 psql 将数据库状态改为已下载
  144. query = f"UPDATE {target} SET download_state = %s WHERE id = %s"
  145. cursor.execute(query, (True, id))
  146. conn.commit()
  147. print(f'图片 {img_file_path} 已存在。继续!')
  148. return
  149. retry = 8
  150. while retry:
  151. try:
  152. resp = httpx.get(img_url, headers={
  153. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
  154. })
  155. with open(img_file_path, 'wb') as f:
  156. f.write(resp.content)
  157. # 下载成功后, 在 psql 将数据库状态改为已下载
  158. query = f"UPDATE {target} SET download_state = %s WHERE id = %s"
  159. cursor.execute(query, (True, id))
  160. conn.commit()
  161. # 算一下进度
  162. rate = index / len(load_data) * 100
  163. print(f'已下载:{img_name}, 当前第 {index + 1} 个, 共 {len(load_data)} 个, 已下载 {rate:.2f}%')
  164. time.sleep(random.uniform(1, 2))
  165. break
  166. except Exception as e:
  167. print(f'下载图片失败:{img_name}。错误:{e} 重试: {retry}')
  168. retry -= 1
  169. time.sleep(random.uniform(3, 5))
  170. conn.close()
  171. def save_data(data_item):
  172. conn = psycopg2.connect(**psql_params)
  173. cursor = conn.cursor()
  174. for k, v in data_item.items():
  175. for data in v:
  176. # 检查img_url是否重复
  177. cursor.execute("SELECT img_url FROM flaticon WHERE img_url = %s", (data['url'],))
  178. if cursor.fetchone() is None:
  179. # 插入数据
  180. cursor.execute("""
  181. INSERT INTO flaticon (name, target_site, file_title, set_name, serial, download_state, image_suffix, img_url)
  182. VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
  183. """, (
  184. None,
  185. target,
  186. data['file_title'],
  187. None,
  188. data['serial'],
  189. False,
  190. data['suffix'],
  191. data['url']
  192. ))
  193. conn.commit()
  194. # print(f"数据 {data['url']} 保存成功")
  195. else:
  196. print(f"数据 {data['url']} 已存在,跳过")
  197. # 关闭数据库连接
  198. cursor.close()
  199. conn.close()
  200. def load_data():
  201. # 连接数据库
  202. conn = psycopg2.connect(**psql_params)
  203. cursor = conn.cursor()
  204. # 查询download_state为false的所有数据
  205. query = f"SELECT * FROM {target} WHERE download_state = %s order by id asc"
  206. load_data_list = []
  207. try:
  208. # 执行查询
  209. cursor.execute(query, (False,))
  210. # 获取查询结果
  211. rows = cursor.fetchall()
  212. # 打印结果
  213. for row in rows:
  214. load_data_list.append(
  215. {
  216. 'id': row[0],
  217. 'name': row[1],
  218. 'target_site': row[2],
  219. 'file_title': row[3],
  220. 'set_name': row[4],
  221. 'serial': row[5],
  222. 'download_state': row[6],
  223. 'image_suffix': row[7],
  224. 'img_url': row[8]
  225. }
  226. )
  227. except psycopg2.Error as e:
  228. print(f"Database error: {e}")
  229. finally:
  230. # 关闭数据库连接
  231. cursor.close()
  232. conn.close()
  233. if load_data_list:
  234. return load_data_list
  235. else:
  236. print("没有需要下载的数据。")
  237. exit(0)
  238. def check_psql():
  239. # 连接数据库
  240. try:
  241. conn = psycopg2.connect(**psql_params)
  242. except Exception as e:
  243. print(f"无法连接到数据库:{e}")
  244. exit(1)
  245. # 创建cursor对象
  246. cur = conn.cursor()
  247. cur.execute("SELECT EXISTS(SELECT FROM pg_catalog.pg_tables WHERE schemaname = 'public' AND tablename = %s)",
  248. (target,))
  249. exist = cur.fetchone()[0]
  250. if not exist:
  251. # 如果不存在,则创建表
  252. cur.execute(f"""
  253. CREATE TABLE {target} (
  254. id SERIAL PRIMARY KEY,
  255. name VARCHAR(255),
  256. target_site VARCHAR(255),
  257. file_title VARCHAR(255),
  258. set_name VARCHAR(255),
  259. serial INT,
  260. download_state BOOLEAN,
  261. image_suffix VARCHAR(50),
  262. img_url VARCHAR(255)
  263. );
  264. """)
  265. print(f"表 '{target}' 创建成功。")
  266. # 提交事务
  267. conn.commit()
  268. # 关闭cursor和连接
  269. cur.close()
  270. conn.close()
  271. def check_local_downloads_dir():
  272. # 查看一下是否存在 downloads 文件夹, 不存在就创建一个
  273. download_file_path = os.path.join(str(project_root), 'downloads')
  274. if not os.path.exists(download_file_path):
  275. os.mkdir(download_file_path)
  276. target_file_path = os.path.join(download_file_path, target)
  277. if not os.path.exists(target_file_path):
  278. os.mkdir(target_file_path)
  279. return target_file_path
  280. def check_target_url_txt():
  281. txt_file_name = 'target_link.txt'
  282. if not os.path.exists(txt_file_name):
  283. with open(txt_file_name, 'w') as file:
  284. file.write('')
  285. print('需要在 target_link.txt 中填写目标链接')
  286. exit(0)
  287. else:
  288. with open('target_link.txt', 'r') as f:
  289. targets = [target.strip() for target in f.readlines()]
  290. if not targets:
  291. print('在 target_link.txt 中未找到目标链接')
  292. exit(0)
  293. return targets
  294. if __name__ == "__main__":
  295. # 检查数据库
  296. check_psql()
  297. if step == 1:
  298. targets = check_target_url_txt()
  299. open_browser(targets)
  300. elif step == 2:
  301. # 开始读取数据
  302. load_data = load_data()
  303. # 开始下载 img
  304. target_file_path = check_local_downloads_dir()
  305. download_img(load_data, target_file_path)
  306. print('下载完成, 程序退出')
  307. elif step == 3:
  308. targets = check_target_url_txt()
  309. # 保存 img 合集链接
  310. open_browser(targets)
  311. # 开始读取数据
  312. load_data = load_data()
  313. # 开始下载 img
  314. target_file_path = check_local_downloads_dir()
  315. download_img(load_data, target_file_path)
  316. print('下载完成, 程序退出')
  317. elif step == 4:
  318. # 调试
  319. pass
  320. else:
  321. pass