step2_download_img.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. # -*- coding: utf-8 -*-
  2. # step2 读取数据库, 下载未下载的图片
  3. import re
  4. import socket
  5. import sys
  6. import os
  7. import time
  8. import random
  9. from concurrent.futures import ThreadPoolExecutor
  10. import psycopg2
  11. import httpx
  12. sys.path.append(os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'))
  13. project_root = os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection')
  14. class ImageCollectorStep2:
  15. def __init__(self):
  16. self.target = 'flaticon' # 整体目标文件夹/psql表名
  17. self.category = '' # 细分分类文件夹
  18. self.local_proxy = 0
  19. self.thread_count = 8
  20. self.title_selector = '#pack-view__inner > section.pack-view__header > h1'
  21. self.img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'
  22. self.img_count_selector = '#pack-view__inner > section.pack-view__header > p'
  23. self.not_find_page_selector = '#viewport > div.errorpage.e404 > h1'
  24. self.psql_params = self.get_psql_params()
  25. def download_img(self, load_data, target_file_path):
  26. print('正在下载图片')
  27. with ThreadPoolExecutor(max_workers=self.thread_count) as executor:
  28. executor.map(self.single_img_download,
  29. [(index, data, load_data, target_file_path) for index, data in enumerate(load_data)])
  30. def single_img_download(self, args):
  31. index, data, load_data, target_file_path = args
  32. conn = psycopg2.connect(**self.psql_params)
  33. cursor = conn.cursor()
  34. id = data['id']
  35. name = data['name']
  36. target_site = data['target_site']
  37. file_title = data['file_title'].replace(' ', '_')
  38. set_name = data['set_name']
  39. serial = str(data['serial']).zfill(3)
  40. image_suffix = data['image_suffix']
  41. img_url = data['img_url']
  42. if self.category:
  43. category_path = os.path.join(target_file_path, self.category)
  44. if not os.path.exists(category_path):
  45. os.mkdir(category_path)
  46. title_file_path = os.path.join(category_path, file_title)
  47. if not os.path.exists(title_file_path):
  48. os.mkdir(title_file_path)
  49. # 否则,直接创建图片文件夹
  50. img_name = f'{file_title}_{serial}.{image_suffix}'
  51. img_file_path = os.path.join(title_file_path, img_name)
  52. else:
  53. title_file_path = os.path.join(target_file_path, file_title)
  54. if not os.path.exists(title_file_path):
  55. os.mkdir(title_file_path)
  56. # 否则,直接创建图片文件夹
  57. img_name = f'{file_title}_{serial}.{image_suffix}'
  58. img_file_path = os.path.join(title_file_path, img_name)
  59. if os.path.exists(img_file_path):
  60. query = f"UPDATE {self.target} SET download_state = %s WHERE id = %s"
  61. cursor.execute(query, (True, id))
  62. conn.commit()
  63. print(f'图片 {img_file_path} 已存在。继续!')
  64. return
  65. retry = 8
  66. while retry:
  67. try:
  68. resp = httpx.get(img_url, headers={
  69. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
  70. })
  71. with open(img_file_path, 'wb') as f:
  72. f.write(resp.content)
  73. query = f"UPDATE {self.target} SET download_state = %s WHERE id = %s"
  74. cursor.execute(query, (True, id))
  75. conn.commit()
  76. rate = index / len(load_data) * 100
  77. print(f'已下载:{img_name}, 当前第 {index + 1} 个, 共 {len(load_data)} 个, 已下载 {rate:.2f}%')
  78. time.sleep(random.uniform(1, 2))
  79. break
  80. except Exception as e:
  81. print(f'下载图片失败:{img_name}。错误:{e} 重试: {retry}')
  82. retry -= 1
  83. time.sleep(random.uniform(3, 5))
  84. conn.close()
  85. def load_data(self):
  86. conn = psycopg2.connect(**self.psql_params)
  87. cursor = conn.cursor()
  88. query = f"SELECT * FROM {self.target} WHERE download_state = %s order by id asc"
  89. load_data_list = []
  90. try:
  91. cursor.execute(query, (False,))
  92. rows = cursor.fetchall()
  93. for row in rows:
  94. load_data_list.append(
  95. {
  96. 'id': row[0],
  97. 'name': row[1],
  98. 'target_site': row[2],
  99. 'file_title': row[3],
  100. 'set_name': row[4],
  101. 'serial': row[5],
  102. 'download_state': row[6],
  103. 'image_suffix': row[7],
  104. 'img_url': row[8],
  105. 'category': row[9]
  106. }
  107. )
  108. except psycopg2.Error as e:
  109. print(f"Database error: {e}")
  110. finally:
  111. cursor.close()
  112. conn.close()
  113. if load_data_list:
  114. return load_data_list
  115. else:
  116. print("没有需要下载的数据。")
  117. return None
  118. def check_local_downloads_dir(self):
  119. download_file_path = os.path.join(str(project_root), 'downloads')
  120. if not os.path.exists(download_file_path):
  121. os.mkdir(download_file_path)
  122. target_file_path = os.path.join(download_file_path, self.target)
  123. if not os.path.exists(target_file_path):
  124. os.mkdir(target_file_path)
  125. return target_file_path
  126. def get_psql_params(self):
  127. s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
  128. s.connect(('10.255.255.255', 1))
  129. IP = s.getsockname()[0]
  130. s.close()
  131. if '192.168.100' not in IP:
  132. return {
  133. "host": "home.erhe.link",
  134. "port": 55434,
  135. "user": "psql",
  136. "password": "psql",
  137. "dbname": "collect"
  138. }
  139. else:
  140. return {
  141. "host": "192.168.100.146",
  142. "port": 5434,
  143. "user": "psql",
  144. "password": "psql",
  145. "dbname": "collect"
  146. }
  147. if __name__ == '__main__':
  148. collector = ImageCollectorStep2()
  149. while True:
  150. load_data = collector.load_data()
  151. if not load_data:
  152. break
  153. target_file_path = collector.check_local_downloads_dir()
  154. collector.download_img(load_data, target_file_path)
  155. print('下载完成, 程序退出')