// step.go
package main

import (
    "bufio"
    "context"
    "encoding/json"
    "fmt"
    "io"
    "log"
    "net/http"
    "net/url"
    "os"
    "path/filepath"
    "regexp"
    "strings"
    "sync"
    "time"

    "fyne.io/fyne/v2/widget"
    "golang.org/x/net/html"
    "golang.org/x/sync/semaphore"
)

// -------------------- Globals --------------------
var (
    illegalChars = regexp.MustCompile(`[<>:"/\\|?*\x00-\x1F]`)
    imgLinkRegex = regexp.MustCompile(`<a href="(.*?)"`)
)
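
// The configuration below is referenced throughout this file but lives in
// another file of this package. A minimal sketch with assumed values; the
// project's real settings may differ.
const (
    TargetsFile     = "targets.txt" // assumed: one gallery URL per line
    FailedRecordUrl = "failed.json" // assumed: JSON array of failed URLs
    DownloadsDir    = "downloads"   // assumed: root folder for gallery indexes
    RetryPerPage    = 3             // assumed: GET attempts per page
    MaxPage         = 100           // assumed: pagination safety limit
    Concurrency     = 4             // assumed: max simultaneous galleries
)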

// -------------------- Data structures --------------------

// GalleryResult reports the outcome of crawling a single gallery URL.
type GalleryResult struct {
    URL   string
    OK    bool
    Error error
}

// CrawlStats aggregates counters across goroutines; mu guards the counts.
type CrawlStats struct {
    Total   int
    Success int
    Failed  int
    Skipped int
    mu      sync.Mutex
}

// -------------------- Helpers --------------------

// cleanFolderName turns a page title into a filesystem-safe directory name.
func cleanFolderName(title string) string {
    // Replace characters that are illegal in file names.
    clean := illegalChars.ReplaceAllString(title, "_")
    // Drop spaces and underscores (including the placeholders inserted above).
    clean = strings.ReplaceAll(clean, " ", "")
    clean = strings.ReplaceAll(clean, "_", "")
    clean = strings.TrimSpace(clean)
    if clean == "" {
        return "gallery"
    }
    return clean
}
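
// For example, cleanFolderName(`My <Gallery> Title?`) first becomes
// "My _Gallery_ Title_" and, after the space/underscore pass, "MyGalleryTitle".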

// loadTargets reads gallery URLs from TargetsFile, skipping blanks and duplicates.
func loadTargets() ([]string, error) {
    file, err := os.Open(TargetsFile)
    if err != nil {
        if os.IsNotExist(err) {
            // Create an empty targets file so the user knows where to put URLs.
            file, err := os.Create(TargetsFile)
            if err != nil {
                return nil, fmt.Errorf("failed to create targets file: %v", err)
            }
            file.Close()
            return nil, fmt.Errorf("targets file did not exist; it has been created, please add URLs first")
        }
        return nil, err
    }
    defer file.Close()
    var targets []string
    seen := make(map[string]bool)
    scanner := bufio.NewScanner(file)
    for scanner.Scan() {
        line := strings.TrimSpace(scanner.Text())
        if line != "" && !seen[line] {
            targets = append(targets, line)
            seen[line] = true
        }
    }
    if err := scanner.Err(); err != nil {
        return nil, err
    }
    if len(targets) == 0 {
        return nil, fmt.Errorf("targets file is empty, please add URLs first")
    }
    return targets, nil
}

// loadFailedUrl reads the JSON list of URLs that failed on a previous run.
func loadFailedUrl() ([]string, error) {
    data, err := os.ReadFile(FailedRecordUrl)
    if err != nil {
        if os.IsNotExist(err) {
            return []string{}, nil
        }
        return nil, err
    }
    var failed []string
    if err := json.Unmarshal(data, &failed); err != nil {
        return nil, err
    }
    return failed, nil
}

// saveFailedUrl persists the list of failed gallery URLs as indented JSON.
func saveFailedUrl(keys []string) error {
    data, err := json.MarshalIndent(keys, "", " ")
    if err != nil {
        return err
    }
    return os.WriteFile(FailedRecordUrl, data, 0644)
}

func ensureDownloadsDir() error {
    return os.MkdirAll(DownloadsDir, 0755)
}

// fetchPage GETs url with up to RetryPerPage attempts and exponential backoff.
func fetchPage(client *http.Client, url string) (string, error) {
    var lastErr error
    for attempt := 1; attempt <= RetryPerPage; attempt++ {
        resp, err := client.Get(url)
        if err != nil {
            lastErr = err
            log.Printf("[%d/%d] request failed %s -> %v", attempt, RetryPerPage, url, err)
            time.Sleep(time.Duration(1<<uint(attempt)) * time.Second)
            continue
        }
        if resp.StatusCode != http.StatusOK {
            resp.Body.Close()
            lastErr = fmt.Errorf("HTTP %d", resp.StatusCode)
            log.Printf("[%d/%d] request failed %s -> %s", attempt, RetryPerPage, url, resp.Status)
            time.Sleep(time.Duration(1<<uint(attempt)) * time.Second)
            continue
        }
        body, err := io.ReadAll(resp.Body)
        resp.Body.Close()
        if err != nil {
            lastErr = err
            log.Printf("[%d/%d] failed to read response %s -> %v", attempt, RetryPerPage, url, err)
            time.Sleep(time.Duration(1<<uint(attempt)) * time.Second)
            continue
        }
        return string(body), nil
    }
    return "", lastErr
}
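
// createHTTPClient is called from UrlDownloader but defined elsewhere in this
// package. A minimal sketch, assuming proxy is a host:port string for an HTTP
// proxy; the real implementation may configure it differently.
func createHTTPClient(proxy string) *http.Client {
    transport := &http.Transport{}
    if proxyURL, err := url.Parse("http://" + proxy); err == nil {
        transport.Proxy = http.ProxyURL(proxyURL)
    }
    return &http.Client{
        Transport: transport,
        Timeout:   30 * time.Second, // assumed per-request timeout
    }
}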

// -------------------- HTML parsing --------------------

// extractTitle returns the text of the first <title> element, or "gallery".
func extractTitle(htmlContent string) string {
    doc, err := html.Parse(strings.NewReader(htmlContent))
    if err != nil {
        return "gallery"
    }
    var title string
    var findTitle func(*html.Node)
    findTitle = func(n *html.Node) {
        if n.Type == html.ElementNode && n.Data == "title" {
            if n.FirstChild != nil {
                title = n.FirstChild.Data
                return
            }
        }
        for c := n.FirstChild; c != nil; c = c.NextSibling {
            findTitle(c)
        }
    }
    findTitle(doc)
    if title == "" {
        return "gallery"
    }
    return title
}

// extractImageLinks scans the raw HTML for <a href="..."> targets via regex.
func extractImageLinks(htmlContent string) []string {
    var links []string
    matches := imgLinkRegex.FindAllStringSubmatch(htmlContent, -1)
    for _, match := range matches {
        if len(match) > 1 {
            links = append(links, match[1])
        }
    }
    return links
}
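
// demoExtractImageLinks illustrates the regex-based extraction on a
// hand-written fragment (the URLs are made up for this example).
func demoExtractImageLinks() {
    sample := `<a href="https://example.com/s/1">a</a><a href="https://example.com/s/2">b</a>`
    for _, link := range extractImageLinks(sample) {
        fmt.Println(link) // prints the two captured hrefs in document order
    }
}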

// -------------------- Gallery crawling --------------------

func crawlSingleGallery(client *http.Client, sem *semaphore.Weighted, galleryURL string, stats *CrawlStats) GalleryResult {
    // Acquire the semaphore to bound concurrency.
    ctx := context.Background()
    if err := sem.Acquire(ctx, 1); err != nil {
        return GalleryResult{URL: galleryURL, OK: false, Error: err}
    }
    defer sem.Release(1)
    // Derive the base URL and the gallery key from the last path segment.
    baseURL := strings.TrimRight(galleryURL, "/")
    parsed, err := url.Parse(baseURL)
    if err != nil {
        return GalleryResult{URL: galleryURL, OK: false, Error: err}
    }
    pathParts := strings.Split(parsed.Path, "/")
    key := pathParts[len(pathParts)-1]
    jsonName := key + ".json"
    var folderPath string
    jsonData := make(map[string]string)
    imgCount := 1
    lastPage := false
    for page := 0; page < MaxPage && !lastPage; page++ {
        pageURL := baseURL
        if page > 0 {
            pageURL = fmt.Sprintf("%s?p=%d", baseURL, page)
        }
        htmlContent, err := fetchPage(client, pageURL)
        if err != nil {
            log.Printf("failed to fetch page %s: %v", pageURL, err)
            continue
        }
        // Extract the title and create the target folder.
        title := extractTitle(htmlContent)
        cleanTitle := cleanFolderName(title)
        folderPath = filepath.Join(DownloadsDir, cleanTitle)
        if err := os.MkdirAll(folderPath, 0755); err != nil {
            return GalleryResult{URL: galleryURL, OK: false, Error: err}
        }
        // Skip the gallery if its JSON index already exists.
        jsonPath := filepath.Join(folderPath, jsonName)
        if _, err := os.Stat(jsonPath); err == nil {
            stats.mu.Lock()
            stats.Skipped++
            stats.mu.Unlock()
            log.Printf("%s already exists, skipping", jsonName)
            return GalleryResult{URL: galleryURL, OK: true}
        }
        log.Printf("current page: %d %s", page+1, pageURL)
        // Extract the image links on this page.
        links := extractImageLinks(htmlContent)
        if len(links) == 0 {
            log.Printf("no image entries on this page; treating it as the last page")
            lastPage = true
            continue
        }
        // Record the links on this page.
        for _, link := range links {
            // Simple duplicate detection: a repeated link marks the last page.
            isDuplicate := false
            for _, existingLink := range jsonData {
                if existingLink == link {
                    isDuplicate = true
                    lastPage = true
                    break
                }
            }
            if isDuplicate {
                break
            }
            jsonData[fmt.Sprintf("%04d", imgCount)] = link
            imgCount++
        }
    }
    // Persist the collected links as a JSON index.
    if len(jsonData) > 0 {
        jsonPath := filepath.Join(folderPath, jsonName)
        data, err := json.MarshalIndent(jsonData, "", " ")
        if err != nil {
            return GalleryResult{URL: galleryURL, OK: false, Error: err}
        }
        if err := os.WriteFile(jsonPath, data, 0644); err != nil {
            return GalleryResult{URL: galleryURL, OK: false, Error: err}
        }
        log.Printf("saved -> %s (%d images)", jsonPath, len(jsonData))
        stats.mu.Lock()
        stats.Success++
        stats.mu.Unlock()
        return GalleryResult{URL: galleryURL, OK: true}
    }
    log.Printf("%s: no image links parsed", key)
    stats.mu.Lock()
    stats.Failed++
    stats.mu.Unlock()
    return GalleryResult{URL: galleryURL, OK: false, Error: fmt.Errorf("no image links parsed")}
}
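
// The <key>.json index written above maps zero-padded ordinals to image-page
// links, e.g. {"0001": "https://example.com/s/1", "0002": "https://example.com/s/2"}
// (URLs illustrative).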

// -------------------- Main flow --------------------

// UrlDownloader drives the whole crawl; output is the Fyne entry intended for
// UI logging (currently unused in this file).
func UrlDownloader(ip, port string, output *widget.Entry) {
    log.SetFlags(log.LstdFlags | log.Lshortfile)
    // Make sure the downloads directory exists.
    if err := ensureDownloadsDir(); err != nil {
        log.Fatalf("failed to create downloads directory: %v", err)
    }
    // Load the target URLs.
    targets, err := loadTargets()
    if err != nil {
        log.Fatal(err)
    }
    // Load the record of previously failed URLs.
    failed, err := loadFailedUrl()
    if err != nil {
        log.Printf("failed to load failure record: %v", err)
        failed = []string{}
    }
    // Merge the URL lists, dropping duplicates.
    allURLs := make([]string, 0)
    seen := make(map[string]bool)
    // Retry previously failed galleries first.
    if len(failed) > 0 {
        log.Printf("retrying %d previously failed galleries first", len(failed))
        for _, url := range failed {
            if !seen[url] {
                allURLs = append(allURLs, url)
                seen[url] = true
            }
        }
    }
    // Then append the new targets.
    for _, url := range targets {
        if !seen[url] {
            allURLs = append(allURLs, url)
            seen[url] = true
        }
    }
    if len(allURLs) == 0 {
        log.Println("no URLs to process")
        return
    }
    log.Printf("processing %d galleries", len(allURLs))
    // Build the HTTP client; join ip and port into a host:port proxy address.
    proxy := ip + ":" + port
    client := createHTTPClient(proxy)
    // Bound concurrency with a weighted semaphore.
    sem := semaphore.NewWeighted(int64(Concurrency))
    stats := &CrawlStats{Total: len(allURLs)}
    // Wait for all crawl goroutines with a WaitGroup.
    var wg sync.WaitGroup
    results := make(chan GalleryResult, len(allURLs))
    // Launch one goroutine per gallery.
    for _, galleryURL := range allURLs {
        wg.Add(1)
        go func(url string) {
            defer wg.Done()
            result := crawlSingleGallery(client, sem, url, stats)
            results <- result
        }(galleryURL)
    }
    // Wait for every task to finish.
    wg.Wait()
    close(results)
    // Collect the failures.
    var newFailed []string
    for result := range results {
        if !result.OK {
            newFailed = append(newFailed, result.URL)
            log.Printf("gallery failed %s: %v", result.URL, result.Error)
        }
    }
    // Persist or clear the failure record.
    if len(newFailed) > 0 {
        if err := saveFailedUrl(newFailed); err != nil {
            log.Printf("failed to save failure record: %v", err)
        } else {
            log.Printf("%d galleries still failing this round; written to %s", len(newFailed), FailedRecordUrl)
        }
    } else {
        // Remove the failure record file.
        if err := os.Remove(FailedRecordUrl); err != nil && !os.IsNotExist(err) {
            log.Printf("failed to remove failure record file: %v", err)
        } else {
            log.Println("all galleries crawled!")
        }
    }
    // Print the summary statistics.
    log.Printf("stats: total=%d, success=%d, failed=%d, skipped=%d",
        stats.Total, stats.Success, stats.Failed, stats.Skipped)
}
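
// Sketch of a call site (illustrative; the project's real UI wiring lives in
// another file). The *widget.Entry is the Fyne output field UrlDownloader
// accepts but does not yet use.
func runUrlDownloaderExample() {
    logView := widget.NewMultiLineEntry()
    UrlDownloader("127.0.0.1", "7890", logView) // assumed proxy host and port
}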