package main

import (
	"bufio"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"sync"
	"time"

	"fyne.io/fyne/v2/widget"
	"golang.org/x/net/html"
	"golang.org/x/sync/semaphore"
)
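
// The configuration identifiers used throughout this file are referenced but
// not defined in it. A minimal sketch of plausible definitions follows,
// assuming they normally live elsewhere in the package; the names appear in
// the original, but every value below is an assumption.
const (
	TargetsFile     = "targets.txt"      // assumed: gallery URLs, one per line
	FailedRecordUrl = "failed_urls.json" // assumed: JSON array of failed URLs
	DownloadsDir    = "downloads"        // assumed: root output directory
	RetryPerPage    = 3                  // assumed: fetch attempts per page
	MaxPage         = 100                // assumed: pagination safety cap
	Concurrency     = 4                  // assumed: concurrent gallery crawls
)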

// -------------------- Globals --------------------
var (
	// Characters that are illegal in file names on most filesystems.
	illegalChars = regexp.MustCompile(`[<>:"/\\|?*\x00-\x1F]`)
	// Naive anchor matcher; assumes gallery pages only link to image pages.
	imgLinkRegex = regexp.MustCompile(`<a href="(.*?)"`)
)

// -------------------- Data structures --------------------

// GalleryResult reports the outcome of crawling one gallery URL.
type GalleryResult struct {
	URL   string
	OK    bool
	Error error
}

// CrawlStats aggregates counters across concurrent crawls; mu guards them.
type CrawlStats struct {
	Total   int
	Success int
	Failed  int
	Skipped int
	mu      sync.Mutex
}

// -------------------- Utilities --------------------

// cleanFolderName turns a page title into a safe folder name.
func cleanFolderName(title string) string {
	// Replace characters that are illegal in file names.
	clean := illegalChars.ReplaceAllString(title, "_")
	// Then drop spaces and underscores entirely (so illegal characters
	// effectively disappear rather than becoming "_").
	clean = strings.ReplaceAll(clean, " ", "")
	clean = strings.ReplaceAll(clean, "_", "")
	clean = strings.TrimSpace(clean)

	if clean == "" {
		return "gallery"
	}
	return clean
}
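
// For illustration, cleanFolderName behaves like this (hypothetical inputs):
//
//	cleanFolderName(`My <Gallery>: Vol.1`) // -> "MyGalleryVol.1"
//	cleanFolderName(`???`)                 // -> "gallery" (empty after cleaning)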

// loadTargets reads gallery URLs (one per line) from TargetsFile, skipping
// blank lines and duplicates.
func loadTargets() ([]string, error) {
	file, err := os.Open(TargetsFile)
	if err != nil {
		if os.IsNotExist(err) {
			// Create an empty targets file so the user can fill it in.
			file, err := os.Create(TargetsFile)
			if err != nil {
				return nil, fmt.Errorf("failed to create targets file: %v", err)
			}
			file.Close()
			return nil, fmt.Errorf("targets file did not exist and was created; please add URLs first")
		}
		return nil, err
	}
	defer file.Close()

	var targets []string
	seen := make(map[string]bool)
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line != "" && !seen[line] {
			targets = append(targets, line)
			seen[line] = true
		}
	}

	if err := scanner.Err(); err != nil {
		return nil, err
	}

	if len(targets) == 0 {
		return nil, fmt.Errorf("targets file is empty; please add URLs first")
	}

	return targets, nil
}

// loadFailedUrl reads the JSON record of gallery URLs that failed last run.
// A missing record file simply means there is nothing to retry.
func loadFailedUrl() ([]string, error) {
	data, err := os.ReadFile(FailedRecordUrl)
	if err != nil {
		if os.IsNotExist(err) {
			return []string{}, nil
		}
		return nil, err
	}

	var failed []string
	err = json.Unmarshal(data, &failed)
	if err != nil {
		return nil, err
	}

	return failed, nil
}

// saveFailedUrl persists failed gallery URLs as a JSON array.
func saveFailedUrl(keys []string) error {
	data, err := json.MarshalIndent(keys, "", " ")
	if err != nil {
		return err
	}

	return os.WriteFile(FailedRecordUrl, data, 0644)
}
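
// For illustration, the failure record is a plain JSON array (URLs here are
// hypothetical):
//
//	[
//	 "https://example.com/g/123456/abcdef",
//	 "https://example.com/g/654321/fedcba"
//	]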

// ensureDownloadsDir creates the root downloads directory if needed.
func ensureDownloadsDir() error {
	return os.MkdirAll(DownloadsDir, 0755)
}

// fetchPage GETs url with up to RetryPerPage attempts, sleeping with
// exponential backoff (2s, 4s, 8s, ...) between failed attempts.
func fetchPage(client *http.Client, url string) (string, error) {
	var lastErr error

	for attempt := 1; attempt <= RetryPerPage; attempt++ {
		resp, err := client.Get(url)
		if err != nil {
			lastErr = err
			log.Printf("[%d/%d] request failed %s -> %v", attempt, RetryPerPage, url, err)
			time.Sleep(time.Duration(1<<uint(attempt)) * time.Second)
			continue
		}

		if resp.StatusCode != http.StatusOK {
			resp.Body.Close()
			lastErr = fmt.Errorf("HTTP %d", resp.StatusCode)
			log.Printf("[%d/%d] request failed %s -> %s", attempt, RetryPerPage, url, resp.Status)
			time.Sleep(time.Duration(1<<uint(attempt)) * time.Second)
			continue
		}

		body, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			lastErr = err
			log.Printf("[%d/%d] failed to read response %s -> %v", attempt, RetryPerPage, url, err)
			time.Sleep(time.Duration(1<<uint(attempt)) * time.Second)
			continue
		}

		return string(body), nil
	}

	return "", lastErr
}

// -------------------- HTML parsing --------------------

// extractTitle returns the text of the first <title> element, falling back
// to "gallery" when parsing fails or no title is found.
func extractTitle(htmlContent string) string {
	doc, err := html.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return "gallery"
	}

	var title string
	var findTitle func(*html.Node)
	findTitle = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "title" {
			if n.FirstChild != nil {
				title = n.FirstChild.Data
				return
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			findTitle(c)
		}
	}
	findTitle(doc)

	if title == "" {
		return "gallery"
	}
	return title
}

// extractImageLinks collects every href captured by imgLinkRegex. It matches
// all anchors on the page, so it relies on gallery pages linking only to
// image entries.
func extractImageLinks(htmlContent string) []string {
	var links []string
	matches := imgLinkRegex.FindAllStringSubmatch(htmlContent, -1)
	for _, match := range matches {
		if len(match) > 1 {
			links = append(links, match[1])
		}
	}
	return links
}
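
// For illustration (hypothetical snippet):
//
//	extractImageLinks(`<a href="/s/ab12/1-1">p1</a><a href="/s/ab12/1-2">p2</a>`)
//	// -> []string{"/s/ab12/1-1", "/s/ab12/1-2"}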

// -------------------- Gallery crawling --------------------

// crawlSingleGallery walks one gallery's pages, collects image-page links,
// and writes them to <folder>/<key>.json. It updates stats and returns a
// GalleryResult for the caller to aggregate.
func crawlSingleGallery(client *http.Client, sem *semaphore.Weighted, galleryURL string, stats *CrawlStats) GalleryResult {
	// Acquire a semaphore slot to bound concurrency.
	ctx := context.Background()
	if err := sem.Acquire(ctx, 1); err != nil {
		return GalleryResult{URL: galleryURL, OK: false, Error: err}
	}
	defer sem.Release(1)

	// Derive the base URL and the gallery key (last path segment).
	baseURL := strings.TrimRight(galleryURL, "/")
	parsed, err := url.Parse(baseURL)
	if err != nil {
		return GalleryResult{URL: galleryURL, OK: false, Error: err}
	}

	pathParts := strings.Split(parsed.Path, "/")
	key := pathParts[len(pathParts)-1]
	jsonName := key + ".json"

	var folderPath string
	jsonData := make(map[string]string)
	imgCount := 1
	lastPage := false

	for page := 0; page < MaxPage && !lastPage; page++ {
		pageURL := baseURL
		if page > 0 {
			pageURL = fmt.Sprintf("%s?p=%d", baseURL, page)
		}

		htmlContent, err := fetchPage(client, pageURL)
		if err != nil {
			log.Printf("failed to fetch page %s: %v", pageURL, err)
			continue
		}

		// Extract the title and create the gallery folder.
		title := extractTitle(htmlContent)
		cleanTitle := cleanFolderName(title)
		folderPath = filepath.Join(DownloadsDir, cleanTitle)

		if err := os.MkdirAll(folderPath, 0755); err != nil {
			return GalleryResult{URL: galleryURL, OK: false, Error: err}
		}

		// Skip the gallery if its JSON index already exists.
		jsonPath := filepath.Join(folderPath, jsonName)
		if _, err := os.Stat(jsonPath); err == nil {
			stats.mu.Lock()
			stats.Skipped++
			stats.mu.Unlock()
			log.Printf("%s already exists, skipping", jsonName)
			return GalleryResult{URL: galleryURL, OK: true}
		}

		log.Printf("current page: %d %s", page+1, pageURL)

		// Extract image links from the page.
		links := extractImageLinks(htmlContent)
		if len(links) == 0 {
			log.Printf("no image entries on this page; treating it as the last page")
			lastPage = true
			continue
		}

		// Record links; a repeated link means pagination has wrapped
		// around, so treat this as the last page (simple duplicate check).
		for _, link := range links {
			isDuplicate := false
			for _, existingLink := range jsonData {
				if existingLink == link {
					isDuplicate = true
					lastPage = true
					break
				}
			}
			if isDuplicate {
				break
			}

			jsonData[fmt.Sprintf("%04d", imgCount)] = link
			imgCount++
		}
	}

	// Save the JSON index.
	if len(jsonData) > 0 {
		jsonPath := filepath.Join(folderPath, jsonName)
		data, err := json.MarshalIndent(jsonData, "", " ")
		if err != nil {
			return GalleryResult{URL: galleryURL, OK: false, Error: err}
		}

		if err := os.WriteFile(jsonPath, data, 0644); err != nil {
			return GalleryResult{URL: galleryURL, OK: false, Error: err}
		}

		log.Printf("saved -> %s (%d images)", jsonPath, len(jsonData))
		stats.mu.Lock()
		stats.Success++
		stats.mu.Unlock()
		return GalleryResult{URL: galleryURL, OK: true}
	}

	log.Printf("%s: no image links parsed", key)
	stats.mu.Lock()
	stats.Failed++
	stats.mu.Unlock()
	return GalleryResult{URL: galleryURL, OK: false, Error: fmt.Errorf("no image links parsed")}
}
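
// A minimal sketch of consuming one gallery's JSON index later (the download
// step itself is not part of this file); loadGalleryIndex is a hypothetical
// helper that restores page order from the zero-padded keys written above.
func loadGalleryIndex(jsonPath string) ([]string, error) {
	data, err := os.ReadFile(jsonPath)
	if err != nil {
		return nil, err
	}
	index := make(map[string]string)
	if err := json.Unmarshal(data, &index); err != nil {
		return nil, err
	}
	// Keys were written as "0001", "0002", ...; walk them in order until
	// the first gap.
	var links []string
	for i := 1; ; i++ {
		link, ok := index[fmt.Sprintf("%04d", i)]
		if !ok {
			break
		}
		links = append(links, link)
	}
	return links, nil
}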

// -------------------- Main flow --------------------

// UrlDownloader loads target and previously-failed gallery URLs, crawls them
// concurrently, and records any failures for the next run. The output entry
// is reserved for streaming progress into the GUI (currently unused).
func UrlDownloader(ip, port string, output *widget.Entry) {
	log.SetFlags(log.LstdFlags | log.Lshortfile)

	// Ensure the downloads directory exists.
	if err := ensureDownloadsDir(); err != nil {
		log.Fatalf("failed to create downloads directory: %v", err)
	}

	// Load target URLs.
	targets, err := loadTargets()
	if err != nil {
		log.Fatal(err)
	}

	// Load the record of galleries that failed last run.
	failed, err := loadFailedUrl()
	if err != nil {
		log.Printf("failed to load failure record: %v", err)
		failed = []string{}
	}

	// Merge the URL lists, de-duplicating.
	allURLs := make([]string, 0)
	seen := make(map[string]bool)

	// Retry previously failed galleries first.
	if len(failed) > 0 {
		log.Printf("retrying %d galleries that failed last run", len(failed))
		for _, url := range failed {
			if !seen[url] {
				allURLs = append(allURLs, url)
				seen[url] = true
			}
		}
	}

	// Then append the new targets.
	for _, url := range targets {
		if !seen[url] {
			allURLs = append(allURLs, url)
			seen[url] = true
		}
	}

	if len(allURLs) == 0 {
		log.Println("no URLs to process")
		return
	}

	log.Printf("processing %d galleries", len(allURLs))

	// Build the HTTP client, routing through the "host:port" proxy.
	proxy := ip + ":" + port
	client := createHTTPClient(proxy)

	// Semaphore bounds the number of concurrent gallery crawls.
	sem := semaphore.NewWeighted(int64(Concurrency))
	stats := &CrawlStats{Total: len(allURLs)}

	// WaitGroup tracks the crawl goroutines; results is buffered so none
	// of them block on send.
	var wg sync.WaitGroup
	results := make(chan GalleryResult, len(allURLs))

	// Launch one goroutine per gallery.
	for _, galleryURL := range allURLs {
		wg.Add(1)
		go func(url string) {
			defer wg.Done()
			result := crawlSingleGallery(client, sem, url, stats)
			results <- result
		}(galleryURL)
	}

	// Wait for all crawls to finish.
	wg.Wait()
	close(results)

	// Collect failures.
	var newFailed []string
	for result := range results {
		if !result.OK {
			newFailed = append(newFailed, result.URL)
			log.Printf("gallery failed %s: %v", result.URL, result.Error)
		}
	}

	// Persist or clear the failure record.
	if len(newFailed) > 0 {
		if err := saveFailedUrl(newFailed); err != nil {
			log.Printf("failed to save failure record: %v", err)
		} else {
			log.Printf("%d galleries still failed this run; recorded in %s", len(newFailed), FailedRecordUrl)
		}
	} else {
		// All succeeded: remove the stale failure record.
		if err := os.Remove(FailedRecordUrl); err != nil && !os.IsNotExist(err) {
			log.Printf("failed to remove failure record: %v", err)
		} else {
			log.Println("all galleries crawled!")
		}
	}

	// Print summary statistics.
	log.Printf("stats: total=%d, success=%d, failed=%d, skipped=%d",
		stats.Total, stats.Success, stats.Failed, stats.Skipped)
}
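
// createHTTPClient is called by UrlDownloader but not defined in this file.
// A minimal sketch, assuming proxy is a "host:port" HTTP proxy address and
// that an empty value means a direct connection; swap in the package's real
// implementation if it differs.
func createHTTPClient(proxy string) *http.Client {
	transport := &http.Transport{}
	if proxy != "" {
		if proxyURL, err := url.Parse("http://" + proxy); err == nil {
			transport.Proxy = http.ProxyURL(proxyURL)
		}
	}
	return &http.Client{
		Transport: transport,
		Timeout:   30 * time.Second,
	}
}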