flaticon.py

# -*- coding: utf-8 -*-
# Two steps overall: 1) pull the target page's info (title, every img URL) into the
# database; 2) read every not-yet-downloaded img URL for this site from the database
# and download the images locally.
# Requires PostgreSQL and a CREATE DATABASE collect; the table itself is created
# automatically on first run.
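#
# A minimal one-time database-setup sketch (assumes a reachable PostgreSQL server and
# the "psql" role used by get_psql_params below; host and port are illustrative):
#
#   import psycopg2
#   conn = psycopg2.connect(host="192.168.100.146", port=5434, user="psql",
#                           password="psql", dbname="postgres")
#   conn.autocommit = True  # CREATE DATABASE cannot run inside a transaction block
#   with conn.cursor() as cur:
#       cur.execute("CREATE DATABASE collect")
#   conn.close()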
import re
import socket
import sys
import os
import time
import random
from concurrent.futures import ThreadPoolExecutor

import psycopg2

# Make project-local modules importable from anywhere inside the repo.
sys.path.append(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection')
import httpx
from playwright.sync_api import sync_playwright


class ImageCollector:
    def __init__(self):
        self.target = 'flaticon'  # top-level download folder / psql table name
        self.category = ''  # sub-category folder
        self.step = 2  # 1 = collect img URLs, 2 = download images, 3 = 1 + 2, 4 = debug
        self.local_proxy = 0
        self.thread_count = 1
        self.title_selector = '#pack-view__inner > section.pack-view__header > h1'
        self.img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'
        self.img_count_selector = '#pack-view__inner > section.pack-view__header > p'
        self.not_find_page_selector = '#viewport > div.errorpage.e404 > h1'
        self.project_root = os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'
        self.psql_params = self.get_psql_params()

    def get_img_set_urls(self, target_urls):
        link_count = 1
        for target_url in target_urls:
            print(f'\nFetching {target_url}, link {link_count} of {len(target_urls)}')
            link_count += 1
            pages = '/{}'
            urls = []
            title = ''
            img_count = 0
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
                img_sequence_num = 1
                for page_count in range(1, 999):
                    goto_url = target_url + pages.format(page_count)
                    if self.local_proxy:
                        proxies = {
                            "http://": "http://127.0.0.1:7890",
                            "https://": "http://127.0.0.1:7890",
                        }
                        with httpx.Client(proxies=proxies, headers=headers) as client:
                            resp = client.get(goto_url, timeout=10)
                    else:
                        with httpx.Client(headers=headers) as client:
                            resp = client.get(goto_url, timeout=10)
                    resp.encoding = 'utf-8'
                    page = resp.text
                    if page_count == 1:
                        # On the first page, extract the pack title.
                        title = re.findall(r'<span class="title">([\S\s]*?)</h1>', page)
                        if title:
                            title = title[0]
                            # Strip characters unsafe in file names plus leftover markup.
                            invalid_chars = ['\n', '<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' ',
                                             'Icon Pack ',
                                             'span', 'class=title-style']
                            for char in invalid_chars:
                                title = title.replace(char, '')
                            title = title.replace('\xa0', '')  # likely targeted non-breaking spaces
                        else:
                            print('failed to extract title')
                            continue
                        # Extract the total number of images in the pack.
                        img_count = re.findall(
                            '<p class="pack-view__header--icons new--badge">(.*?) <span class="uppercase">icons</span></p>',
                            page)
                        if img_count:
                            img_count = int(img_count[0])
                    else:
                        # Past the first page: stop once the site serves its 404 page.
                        errorpage = re.findall('<title>Oopsies... Seems like you got lost! - Flaticon</title>', page)
                        if errorpage:
                            break
                    # Image URLs live in the page's embedded JSON-LD ("contentUrl" entries).
                    re_urls = re.findall('"contentUrl":"(.*?)"', page)
                    for url in re_urls:
                        src = url.replace('/128/', '/512/')  # request the 512px rendition
                        suffix = src.split('.')[-1]
                        sequence = str(img_sequence_num).zfill(3)
                        urls.append({
                            'url': src,
                            'file_title': title,
                            'serial': sequence,
                            'category': self.category,
                            'img': f'{title}_{sequence}',
                            'suffix': suffix
                        })
                        img_sequence_num += 1
                    time.sleep(random.uniform(1, 2))
            except Exception as e:
                print(f'Failed to fetch {target_url}: {str(e)}')
                self.save_error_urls(target_url)
                continue
            # Once the data is collected, persist it to the database.
            self.save_data({title: urls})
            print(f'{title} has {len(urls)} images, saved')
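
    # For reference, get_img_set_urls above scrapes the "contentUrl" entries embedded
    # in the page's JSON-LD; a matched value looks roughly like this (the URL is
    # illustrative):
    #   "contentUrl":"https://cdn-icons-png.flaticon.com/128/1828/1828884.png"
    # which the /128/ -> /512/ swap upgrades to the 512px rendition.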

    def open_browser(self, target_urls):
        link_count = 1
        for target_url in target_urls:
            print(f'\nFetching {target_url}, link {link_count} of {len(target_urls)}')
            link_count += 1
            pages = '/{}'
            urls = []
            title = ''
            total_page_count = 0
            img_count = 0
            with sync_playwright() as playwright:
                if self.local_proxy:
                    browser = playwright.chromium.launch(
                        headless=True,
                        proxy={"server": "http://127.0.0.1:7890"}
                    )
                else:
                    browser = playwright.chromium.launch(headless=True)
                context = browser.new_context(viewport={'width': 1280, 'height': 700})
                page = context.new_page()
                img_sequence_num = 1
                for page_count in range(1, 999):
                    try:
                        goto_url = target_url + pages.format(page_count)
                        page.goto(goto_url, timeout=8000)
                    except Exception:
                        pass  # navigation timeouts are tolerated; the selectors below decide what to do
                    if page_count == 1:
                        page.wait_for_selector(self.title_selector, state="attached", timeout=10000)
                        title = page.query_selector(self.title_selector).inner_text()
                        img_count = page.query_selector(self.img_count_selector).inner_text()
                        img_count = int(img_count.split(' ')[0])
                        invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' ', 'Icon Pack ']
                        for char in invalid_chars:
                            title = title.replace(char, '')
                    else:
                        try:
                            # The 404 selector appearing means there is no next page.
                            page.wait_for_selector(self.not_find_page_selector, state="attached", timeout=2000)
                            total_page_count = page_count - 1
                            break
                        except Exception:
                            pass
                    for i in range(1, img_count + 1):
                        elements = page.query_selector_all(self.img_selector.format(i))
                        for element in elements:
                            src = element.get_attribute('src')
                            if src:
                                src = src.replace('/128/', '/512/')
                                suffix = src.split('.')[-1]
                                sequence = str(img_sequence_num).zfill(3)
                                urls.append({
                                    'url': src,
                                    'file_title': title,
                                    'serial': sequence,
                                    'category': self.category,
                                    'img': f'{title}_{sequence}',
                                    'suffix': suffix
                                })
                                img_sequence_num += 1
                            break  # nth-child matches a single <li>; only the first hit is needed
                print(f'All image URLs collected. {total_page_count} pages total, {len(urls)} images. Writing to database...')
                page.close()
                browser.close()
            self.save_data({title: urls})
            print(f'{title} saved')

    def download_img(self, load_data, target_file_path):
        print('Downloading images')
        # Each work item is a tuple so it can travel through executor.map's
        # single-argument-per-call interface.
        with ThreadPoolExecutor(max_workers=self.thread_count) as executor:
            executor.map(self.single_img_download,
                         [(index, data, load_data, target_file_path) for index, data in enumerate(load_data)])

    def single_img_download(self, args):
        index, data, load_data, target_file_path = args
        conn = psycopg2.connect(**self.psql_params)
        cursor = conn.cursor()
        id = data['id']
        name = data['name']
        target_site = data['target_site']
        file_title = data['file_title'].replace(' ', '_')
        set_name = data['set_name']
        serial = str(data['serial']).zfill(3)
        image_suffix = data['image_suffix']
        img_url = data['img_url']
        if self.category:
            # Place images under <target>/<category>/<title>/.
            category_path = os.path.join(target_file_path, self.category)
            os.makedirs(category_path, exist_ok=True)
            title_file_path = os.path.join(category_path, file_title)
        else:
            # No category: place images directly under <target>/<title>/.
            title_file_path = os.path.join(target_file_path, file_title)
        os.makedirs(title_file_path, exist_ok=True)  # exist_ok avoids races between worker threads
        img_name = f'{file_title}_{serial}.{image_suffix}'
        img_file_path = os.path.join(title_file_path, img_name)
        if os.path.exists(img_file_path):
            query = f"UPDATE {self.target} SET download_state = %s WHERE id = %s"
            cursor.execute(query, (True, id))
            conn.commit()
            print(f'Image {img_file_path} already exists. Skipping.')
            return
        retry = 8
        while retry:
            try:
                resp = httpx.get(img_url, headers={
                    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
                })
                resp.raise_for_status()  # don't write error pages to disk
                with open(img_file_path, 'wb') as f:
                    f.write(resp.content)
                query = f"UPDATE {self.target} SET download_state = %s WHERE id = %s"
                cursor.execute(query, (True, id))
                conn.commit()
                rate = index / len(load_data) * 100
                print(f'Downloaded {img_name}, {index + 1} of {len(load_data)} ({rate:.2f}% done)')
                time.sleep(random.uniform(1, 2))
                break
            except Exception as e:
                print(f'Failed to download {img_name}. Error: {e}, retries left: {retry}')
                retry -= 1
                time.sleep(random.uniform(3, 5))
        conn.close()

    def save_data(self, data_item):
        conn = psycopg2.connect(**self.psql_params)
        cursor = conn.cursor()
        for k, v in data_item.items():
            for data in v:
                # Skip URLs that are already recorded.
                cursor.execute(f"SELECT img_url FROM {self.target} WHERE img_url = %s", (data['url'],))
                if cursor.fetchone() is None:
                    cursor.execute(f"""
                        INSERT INTO {self.target} (name, target_site, file_title, set_name, serial, download_state, image_suffix, img_url, category)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                    """, (
                        None,
                        self.target,
                        data['file_title'],
                        None,
                        data['serial'],
                        False,
                        data['suffix'],
                        data['url'],
                        data['category'] or None,
                    ))
        conn.commit()
        cursor.close()
        conn.close()

    def load_data(self):
        conn = psycopg2.connect(**self.psql_params)
        cursor = conn.cursor()
        query = f"SELECT * FROM {self.target} WHERE download_state = %s ORDER BY id ASC"
        load_data_list = []
        try:
            cursor.execute(query, (False,))
            rows = cursor.fetchall()
            for row in rows:
                load_data_list.append(
                    {
                        'id': row[0],
                        'name': row[1],
                        'target_site': row[2],
                        'file_title': row[3],
                        'set_name': row[4],
                        'serial': row[5],
                        'download_state': row[6],
                        'image_suffix': row[7],
                        'img_url': row[8],
                        'category': row[9]
                    }
                )
        except psycopg2.Error as e:
            print(f"Database error: {e}")
        finally:
            cursor.close()
            conn.close()
        if load_data_list:
            return load_data_list
        else:
            print('No data left to download.')
            exit(0)

    def check_psql(self):
        try:
            conn = psycopg2.connect(**self.psql_params)
        except Exception as e:
            print(f'Could not connect to the database: {e}')
            exit(1)
        cur = conn.cursor()
        cur.execute("SELECT EXISTS(SELECT FROM pg_catalog.pg_tables WHERE schemaname = 'public' AND tablename = %s)",
                    (self.target,))
        exist = cur.fetchone()[0]
        if not exist:
            cur.execute(f"""
                CREATE TABLE {self.target} (
                    id SERIAL PRIMARY KEY,
                    name VARCHAR(255),
                    target_site VARCHAR(255),
                    file_title VARCHAR(255),
                    set_name VARCHAR(255),
                    serial INT,
                    download_state BOOLEAN,
                    image_suffix VARCHAR(50),
                    img_url TEXT,
                    category VARCHAR(255)
                );
            """)
            print(f"Table '{self.target}' created.")
        conn.commit()
        cur.close()
        conn.close()

    def check_local_downloads_dir(self):
        download_file_path = os.path.join(str(self.project_root), 'downloads')
        if not os.path.exists(download_file_path):
            os.mkdir(download_file_path)
        target_file_path = os.path.join(download_file_path, self.target)
        if not os.path.exists(target_file_path):
            os.mkdir(target_file_path)
        return target_file_path

    def check_target_url_txt(self):
        txt_file_name = 'target_link.txt'
        if not os.path.exists(txt_file_name):
            with open(txt_file_name, 'w') as file:
                file.write('')
            print('Fill in the target links in target_link.txt')
            exit(0)
        else:
            with open(txt_file_name, 'r') as f:
                targets = [target.strip() for target in f.readlines()]
            if not targets:
                print('No target links found in target_link.txt')
                exit(0)
            return targets
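
    # target_link.txt is expected to hold one Flaticon pack URL per line; the pack
    # slug below is purely illustrative:
    #   https://www.flaticon.com/packs/essential-set-2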

    def get_psql_params(self):
        # Connecting a UDP socket sends no packets; it only binds the socket to the
        # outbound interface, which reveals the local LAN IP.
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(('10.255.255.255', 1))
        IP = s.getsockname()[0]
        s.close()
        if '192.168.100' not in IP:
            # Off the home LAN: reach PostgreSQL through the public hostname.
            return {
                "host": "home.erhe.link",
                "port": 55434,
                "user": "psql",
                "password": "psql",
                "dbname": "collect"
            }
        else:
            return {
                "host": "192.168.100.146",
                "port": 5434,
                "user": "psql",
                "password": "psql",
                "dbname": "collect"
            }

    def save_error_urls(self, error_url):
        error_txt_path = os.path.join(str(self.project_root), str(self.target), 'error_url.txt')
        os.makedirs(os.path.dirname(error_txt_path), exist_ok=True)  # ensure the target dir exists
        if not os.path.exists(error_txt_path):
            open(error_txt_path, 'w').close()
        with open(error_txt_path, 'r') as f:
            existing_urls = f.read().splitlines()
        if error_url in existing_urls:
            return
        with open(error_txt_path, 'a') as f:
            f.write(error_url + '\n')


if __name__ == '__main__':
    collector = ImageCollector()
    collector.check_psql()
    if collector.step == 1:
        targets = collector.check_target_url_txt()
        # Pick one of the two collectors below.
        # collector.open_browser(targets)  # via a headless browser
        collector.get_img_set_urls(targets)  # via httpx
    elif collector.step == 2:
        load_data = collector.load_data()
        target_file_path = collector.check_local_downloads_dir()
        collector.download_img(load_data, target_file_path)
        print('Download finished, exiting')
    elif collector.step == 3:
        targets = collector.check_target_url_txt()
        # Pick one of the two collectors below.
        # collector.open_browser(targets)  # via a headless browser
        collector.get_img_set_urls(targets)  # via httpx
        load_data = collector.load_data()
        target_file_path = collector.check_local_downloads_dir()
        collector.download_img(load_data, target_file_path)
        print('Download finished, exiting')
    elif collector.step == 4:
        pass  # debug hook
    else:
        pass
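
# Typical workflow (a sketch): set self.step = 1 and run once to harvest image URLs
# into PostgreSQL, then set self.step = 2 and run again to download them; step = 3
# does both in one run:
#   $ python flaticon.py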