# eh_art.py (1.9 KB)
# -*- coding: utf-8 -*-
# download file
import os
import re
import time
from pathlib import Path

import httpx
from playwright.sync_api import sync_playwright
  9. # 确保保存图片的目录存在
  10. save_dir = Path("downloaded_images")
  11. save_dir.mkdir(parents=True, exist_ok=True)
  12. base_url = "https://e-hentai.org/g/"
  13. href_url = "3055404/7ce423edd8" # 每次改这个
  14. change_page = '?p='
  15. with sync_playwright() as playwright:
  16. browser = playwright.webkit.launch(headless=True) # 启动浏览器,headless=True 表示无头模式
  17. page = browser.new_page() # 创建新页面
  18. # 导航到网页
  19. page.goto(base_url + href_url)
  20. time.sleep(0.5)
  21. # 获取页面标题
  22. title = page.title()
  23. img_file = Path(f"{save_dir}/{title}")
  24. all_url_data = {}
  25. # TODO 此处循环获取每个页面的所有图片链接
  26. # 获取页面的 HTML 内容
  27. content = page.content()
  28. # print(content)
  29. view_img_list = re.findall('no-repeat"><a href="(.*?)">', content)
  30. all_view_img_list = [i for i in view_img_list]
  31. max_page_mun = re.findall('onclick="return false">(\\d*?)</a></td>', content)
  32. if max_page_mun:
  33. max_page_mun = int(max(max_page_mun))
  34. for view_page in range(1, max_page_mun):
  35. page.goto(base_url + href_url + change_page + str(view_page))
  36. content = page.content()
  37. view_img_list = re.findall('no-repeat"><a href="(.*?)">', content)
  38. for i in view_img_list:
  39. all_view_img_list.append(i)
  40. print(list(set(all_view_img_list)))
  41. # # 访问内层图片链接
  42. # for n, img_url in enumerate(view_img_list):
  43. # page.goto(img_url)
  44. # img_content = page.content()
  45. # b_img = re.findall('<img id="img" src="(.*?)">', content)
  46. # if b_img:
  47. # all_url_data.update({str(n).zfill(4): b_img[0]})
  48. #
  49. # print(all_url_data)
  50. # 关闭浏览器
  51. browser.close()