step1_get_img_set_url.py
# -*- coding: utf-8 -*-
# Collect every image in an image pack on the target site and store the URLs in the database.
import re
import socket
import sys
import os
import time
import random
import psycopg2
import httpx
from playwright.sync_api import sync_playwright

# Resolve the project root from this file's path and make it importable.
project_root = os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'
sys.path.append(project_root)


class ImageCollectorStep1:
    def __init__(self):
        self.target = 'flaticon'  # top-level target folder / PostgreSQL table name
        self.category = ''  # sub-category folder
        self.local_proxy = 0  # set to 1 to route requests through a local proxy
        self.thread_count = 8
        self.title_selector = '#pack-view__inner > section.pack-view__header > h1'
        self.img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'
        self.img_count_selector = '#pack-view__inner > section.pack-view__header > p'
        self.not_find_page_selector = '#viewport > div.errorpage.e404 > h1'
        self.psql_params = self.get_psql_params()
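
    # HTTP path: walk the pack's numbered pages with httpx, regex the pack title,
    # image count, and "contentUrl" entries out of the raw HTML, and stop once the
    # site serves its 404 page. Faster than the browser path, but tied to the markup.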
    def get_img_set_urls(self, target_urls):
        link_count = 1
        for target_url in target_urls:
            print(f'\nFetching {target_url} (link {link_count} of {len(target_urls)})')
            link_count += 1
            pages = '/{}'
            urls = []
            title = ''
            total_page_count = 0
            img_count = 0
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
                img_sequence_num = 1
                for page_count in range(1, 999):
                    goto_url = target_url + pages.format(page_count)
                    if self.local_proxy:
                        # 'proxies=' matches older httpx releases; newer ones renamed it to 'proxy='.
                        proxies = {
                            "http://": "http://127.0.0.1:7890",
                            "https://": "http://127.0.0.1:7890",
                        }
                        with httpx.Client(proxies=proxies, headers=headers) as client:
                            resp = client.get(goto_url, timeout=10)
                    else:
                        with httpx.Client(headers=headers) as client:
                            resp = client.get(goto_url, timeout=10)
                    resp.encoding = 'utf-8'
                    page = resp.text
                    if page_count == 1:
                        # On the first page, extract the pack title.
                        title = re.findall(r'<span class="title">([\S\s]*?)</h1>', page)
                        if title:
                            title = title[0]
                            # Remove 'Icon Pack ' before stripping spaces, otherwise it can never match.
                            invalid_chars = ['\n', '<', '>', ':', '"', '/', '\\', '|', '?', '*', '.',
                                             'Icon Pack ', ' ',
                                             'span', 'class=title-style']
                            for char in invalid_chars:
                                title = title.replace(char, '')
                            title = title.replace('\xa0', ' ')  # the original replaced a non-breaking-space-like character
                            title = title.strip()
                        else:
                            # Treat a missing title as a failed target so it lands in the error file.
                            raise ValueError('failed to parse the pack title')
                        # Extract the total image count.
                        img_count = re.findall(
                            '<p class="pack-view__header--icons new--badge">(.*?) <span class="uppercase">icons</span></p>',
                            page)
                        if img_count:
                            img_count = int(img_count[0])
                    else:
                        # Past the first page, stop once the site serves its 404 page.
                        errorpage = re.findall('<title>Oopsies... Seems like you got lost! - Flaticon</title>', page)
                        if errorpage:
                            break
                    re_urls = re.findall('"contentUrl":"(.*?)"', page)
                    for url in re_urls:
                        src = url.replace('/128/', '/512/')  # swap the 128px thumbnail for the 512px version
                        suffix = src.split('.')[-1]
                        sequence = str(img_sequence_num).zfill(3)
                        urls.append({
                            'url': src,
                            'file_title': title,
                            'serial': sequence,
                            'category': self.category,
                            'img': f'{title}_{sequence}',
                            'suffix': suffix
                        })
                        img_sequence_num += 1
                    time.sleep(random.uniform(1, 2))
            except Exception as e:
                print(f'Failed to fetch {target_url}: {str(e)}')
                self.save_error_urls(target_url)
                continue
            # Persist the collected URLs.
            self.save_data({title: urls})
            print(f'{title} has {len(urls)} images, saved')
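
    # Browser path: the same collection flow as get_img_set_urls, but driven through
    # headless Chromium via Playwright, reading the title and image URLs from the
    # rendered DOM with the CSS selectors configured in __init__.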
    def open_browser(self, target_urls):
        link_count = 1
        for target_url in target_urls:
            print(f'\nFetching {target_url} (link {link_count} of {len(target_urls)})')
            link_count += 1
            pages = '/{}'
            urls = []
            title = ''
            total_page_count = 0
            img_count = 0
            with sync_playwright() as playwright:
                if self.local_proxy:
                    browser = playwright.chromium.launch(
                        headless=True,
                        proxy={"server": "http://127.0.0.1:7890"}
                    )
                else:
                    browser = playwright.chromium.launch(headless=True)
                context = browser.new_context(viewport={'width': 1280, 'height': 700})
                page = context.new_page()
                img_sequence_num = 1
                for page_count in range(1, 999):
                    try:
                        goto_url = target_url + pages.format(page_count)
                        page.goto(goto_url, timeout=8000)
                    except Exception:
                        # Ignore navigation timeouts; the selector waits below handle readiness.
                        pass
                    if page_count == 1:
                        page.wait_for_selector(self.title_selector, state="attached", timeout=10000)
                        title = page.query_selector(self.title_selector).inner_text()
                        img_count = page.query_selector(self.img_count_selector).inner_text()
                        img_count = int(img_count.split(' ')[0])
                        # Remove 'Icon Pack ' before stripping spaces, otherwise it can never match.
                        invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', 'Icon Pack ', ' ']
                        for char in invalid_chars:
                            title = title.replace(char, '')
                    else:
                        try:
                            # If the 404 selector appears, there is no next page.
                            page.wait_for_selector(self.not_find_page_selector, state="attached", timeout=2000)
                            total_page_count = page_count - 1
                            break
                        except:
                            pass
                    for i in range(1, img_count + 1):
                        elements = page.query_selector_all(self.img_selector.format(i))
                        for element in elements:
                            src = element.get_attribute('src')
                            if src:
                                src = src.replace('/128/', '/512/')
                                suffix = src.split('.')[-1]
                                sequence = str(img_sequence_num).zfill(3)
                                urls.append({
                                    'url': src,
                                    'file_title': title,
                                    'serial': sequence,
                                    'category': self.category,
                                    'img': f'{title}_{sequence}',
                                    'suffix': suffix
                                })
                                img_sequence_num += 1
                            break  # the nth-child selector matches at most one element
                print(f'All image URLs collected: {total_page_count} pages, {len(urls)} images, writing to database...')
                page.close()
                browser.close()
            self.save_data({title: urls})
            print(f'{title} saved')
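
    # Insert each collected URL into the target table, skipping rows whose img_url
    # already exists so the script can be rerun safely.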
    def save_data(self, data_item):
        conn = psycopg2.connect(**self.psql_params)
        cursor = conn.cursor()
        for k, v in data_item.items():
            for data in v:
                cursor.execute(f"SELECT img_url FROM {self.target} WHERE img_url = %s", (data['url'],))
                if cursor.fetchone() is None:
                    cursor.execute(f"""
                        INSERT INTO {self.target} (name, target_site, file_title, set_name, serial, download_state, image_suffix, img_url, category)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                    """, (
                        data['img'],  # generated file name, e.g. '<title>_001'
                        self.target,
                        data['file_title'],
                        None,
                        data['serial'],
                        False,
                        data['suffix'],
                        data['url'],
                        data['category'],
                    ))
        conn.commit()
        cursor.close()
        conn.close()
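
    # Verify the database connection and create the target table on first run.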
    def check_psql(self):
        try:
            conn = psycopg2.connect(**self.psql_params)
        except Exception as e:
            print(f'Cannot connect to the database: {e}')
            sys.exit(1)
        cur = conn.cursor()
        cur.execute("SELECT EXISTS(SELECT FROM pg_catalog.pg_tables WHERE schemaname = 'public' AND tablename = %s)",
                    (self.target,))
        exist = cur.fetchone()[0]
        if not exist:
            cur.execute(f"""
                CREATE TABLE {self.target} (
                    id SERIAL PRIMARY KEY,
                    name VARCHAR(255),
                    target_site VARCHAR(255),
                    file_title VARCHAR(255),
                    set_name VARCHAR(255),
                    serial INT,
                    download_state BOOLEAN,
                    image_suffix VARCHAR(50),
                    img_url TEXT,
                    category VARCHAR(255)
                );
            """)
            print(f"Table '{self.target}' created.")
        conn.commit()
        cur.close()
        conn.close()
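
    # Append a failed target URL to the error file (deduplicated) so it can be retried later.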
    def save_error_urls(self, error_url):
        error_txt_path = os.path.join(str(project_root), str(self.target), 'url_file_2error_url.txt')
        os.makedirs(os.path.dirname(error_txt_path), exist_ok=True)  # make sure the target folder exists
        if not os.path.exists(error_txt_path):
            open(error_txt_path, 'w').close()
        with open(error_txt_path, 'r') as f:
            existing_urls = f.read().splitlines()
        if error_url in existing_urls:
            return
        with open(error_txt_path, 'a') as f:
            f.write(error_url + '\n')
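
    # Read the target links, creating an empty template file on first run.
    # Expected url_file_1_target_link.txt format, one pack URL per line
    # (these example URLs are illustrative, not taken from the original source):
    #   https://www.flaticon.com/packs/example-pack
    #   https://www.flaticon.com/packs/another-pack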
    def check_target_url_txt(self):
        txt_file_name = 'url_file_1_target_link.txt'
        if not os.path.exists(txt_file_name):
            with open(txt_file_name, 'w') as file:
                file.write('')
            print(f'Fill in the target links in {txt_file_name}')
            sys.exit(0)
        else:
            with open(txt_file_name, 'r') as f:
                targets = [line.strip() for line in f if line.strip()]  # skip blank lines
            if not targets:
                print(f'No target links found in {txt_file_name}')
                sys.exit(0)
            return targets
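
    # Choose database credentials by discovering the local IP: connecting a UDP
    # socket to a non-routable address sends no packets, but binds the socket to
    # the outbound interface, which getsockname() then reports.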
    def get_psql_params(self):
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(('10.255.255.255', 1))
        IP = s.getsockname()[0]
        s.close()
        if '192.168.100' not in IP:
            return {
                "host": "home.erhe.link",
                "port": 55434,
                "user": "psql",
                "password": "psql",
                "dbname": "collect"
            }
        else:
            return {
                "host": "192.168.100.146",
                "port": 5434,
                "user": "psql",
                "password": "psql",
                "dbname": "collect"
            }


if __name__ == '__main__':
    collector = ImageCollectorStep1()
    collector.check_psql()
    targets = collector.check_target_url_txt()
    # Pick one of the two collection paths:
    # collector.open_browser(targets)  # Playwright browser
    collector.get_img_set_urls(targets)  # plain httpx requests