package main

import (
	"bufio"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"sync"
	"time"

	"fyne.io/fyne/v2/widget"
	"golang.org/x/net/html"
	"golang.org/x/sync/semaphore"
)

// -------------------- Globals --------------------

var (
	// Characters that are illegal in file and folder names on most platforms.
	illegalChars = regexp.MustCompile(`[<>:"/\\|?*\x00-\x1F]`)
	// NOTE: placeholder pattern; the original regex did not survive in the
	// recovered source. This generic anchor-href matcher should be tightened
	// to the target site's image-page links before use.
	imgLinkRegex = regexp.MustCompile(`<a href="([^"]+)"`)
)

// CrawlStats aggregates counters across concurrent gallery crawls.
type CrawlStats struct {
	mu      sync.Mutex
	Total   int
	Success int
	Failed  int
	Skipped int
}

// GalleryResult reports the outcome of crawling a single gallery.
type GalleryResult struct {
	URL   string
	OK    bool
	Error error
}

// -------------------- Page fetching --------------------

// fetchPage GETs a page with up to RetryPerPage attempts, backing off
// exponentially (2^attempt seconds) between failures.
func fetchPage(client *http.Client, url string) (string, error) {
	for attempt := 1; attempt <= RetryPerPage; attempt++ {
		resp, err := client.Get(url)
		if err != nil {
			log.Printf("attempt %d/%d for %s failed -> %v", attempt, RetryPerPage, url, err)
			time.Sleep(time.Duration(1<<attempt) * time.Second)
			continue
		}
		if resp.StatusCode != http.StatusOK {
			resp.Body.Close()
			log.Printf("attempt %d/%d for %s got status -> %s", attempt, RetryPerPage, url, resp.Status)
			time.Sleep(time.Duration(1<<attempt) * time.Second)
			continue
		}
		body, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			log.Printf("attempt %d/%d reading %s failed -> %v", attempt, RetryPerPage, url, err)
			time.Sleep(time.Duration(1<<attempt) * time.Second)
			continue
		}
		return string(body), nil
	}
	return "", fmt.Errorf("giving up on %s after %d attempts", url, RetryPerPage)
}

// extractImageLinks collects the first capture group of imgLinkRegex
// from every match in the page.
func extractImageLinks(htmlContent string) []string {
	matches := imgLinkRegex.FindAllStringSubmatch(htmlContent, -1)
	links := make([]string, 0, len(matches))
	for _, match := range matches {
		if len(match) > 1 {
			links = append(links, match[1])
		}
	}
	return links
}

// -------------------- Gallery crawling --------------------

func crawlSingleGallery(client *http.Client, sem *semaphore.Weighted, galleryURL string, stats *CrawlStats) GalleryResult {
	// Acquire a semaphore slot to bound overall concurrency.
	ctx := context.Background()
	if err := sem.Acquire(ctx, 1); err != nil {
		return GalleryResult{URL: galleryURL, OK: false, Error: err}
	}
	defer sem.Release(1)

	// Derive the base URL and the gallery key (last path segment).
	baseURL := strings.TrimRight(galleryURL, "/")
	parsed, err := url.Parse(baseURL)
	if err != nil {
		return GalleryResult{URL: galleryURL, OK: false, Error: err}
	}
	pathParts := strings.Split(parsed.Path, "/")
	key := pathParts[len(pathParts)-1]
	jsonName := key + ".json"

	var folderPath string
	jsonData := make(map[string]string)
	imgCount := 1
	lastPage := false

	for page := 0; page < MaxPage && !lastPage; page++ {
		pageURL := baseURL
		if page > 0 {
			pageURL = fmt.Sprintf("%s?p=%d", baseURL, page)
		}

		htmlContent, err := fetchPage(client, pageURL)
		if err != nil {
			log.Printf("failed to fetch page %s: %v", pageURL, err)
			continue
		}

		// Extract the title and create the gallery folder.
		title := extractTitle(htmlContent)
		cleanTitle := cleanFolderName(title)
		folderPath = filepath.Join(DownloadsDir, cleanTitle)
		if err := os.MkdirAll(folderPath, 0755); err != nil {
			return GalleryResult{URL: galleryURL, OK: false, Error: err}
		}

		// Skip the gallery if its JSON index already exists.
		jsonPath := filepath.Join(folderPath, jsonName)
		if _, err := os.Stat(jsonPath); err == nil {
			stats.mu.Lock()
			stats.Skipped++
			stats.mu.Unlock()
			log.Printf("%s already exists, skipping", jsonName)
			return GalleryResult{URL: galleryURL, OK: true}
		}

		log.Printf("current page: %d %s", page+1, pageURL)

		// Extract the image links on this page.
		links := extractImageLinks(htmlContent)
		if len(links) == 0 {
			log.Printf("no image entries on this page; treating it as the last page")
			lastPage = true
			continue
		}

		// Record the links. A repeated link means pagination has wrapped
		// around, so treat it as the last page (simple duplicate detection).
		for _, link := range links {
			isDuplicate := false
			for _, existingLink := range jsonData {
				if existingLink == link {
					isDuplicate = true
					lastPage = true
					break
				}
			}
			if isDuplicate {
				break
			}
			jsonData[fmt.Sprintf("%04d", imgCount)] = link
			imgCount++
		}
	}

	// Persist the collected links as the gallery's JSON index.
	if len(jsonData) > 0 {
		jsonPath := filepath.Join(folderPath, jsonName)
		data, err := json.MarshalIndent(jsonData, "", " ")
		if err != nil {
			return GalleryResult{URL: galleryURL, OK: false, Error: err}
		}
		if err := os.WriteFile(jsonPath, data, 0644); err != nil {
			return GalleryResult{URL: galleryURL, OK: false, Error: err}
		}
		log.Printf("saved -> %s (%d images)", jsonPath, len(jsonData))
		stats.mu.Lock()
		stats.Success++
		stats.mu.Unlock()
		return GalleryResult{URL: galleryURL, OK: true}
	}

	log.Printf("no image links parsed for %s", key)
	stats.mu.Lock()
	stats.Failed++
	stats.mu.Unlock()
	return GalleryResult{URL: galleryURL, OK: false, Error: fmt.Errorf("no image links parsed")}
}

// -------------------- Main flow --------------------

// UrlDownloader crawls every target gallery and writes one JSON index per
// gallery. output is the GUI log widget (currently unused here).
func UrlDownloader(ip, port string, output *widget.Entry) {
	log.SetFlags(log.LstdFlags | log.Lshortfile)

	// Make sure the downloads directory exists.
	if err := ensureDownloadsDir(); err != nil {
		log.Fatalf("failed to create downloads directory: %v", err)
	}

	// Load the target URLs.
	targets, err := loadTargets()
	if err != nil {
		log.Fatal(err)
	}

	// Load the record of previously failed URLs.
	failed, err := loadFailedUrl()
	if err != nil {
		log.Printf("failed to load failure record: %v", err)
		failed = []string{}
	}

	// Merge the URL lists, de-duplicating and retrying failures first.
	allURLs := make([]string, 0)
	seen := make(map[string]bool)
	if len(failed) > 0 {
		log.Printf("retrying %d galleries that failed last run", len(failed))
		for _, u := range failed {
			if !seen[u] {
				allURLs = append(allURLs, u)
				seen[u] = true
			}
		}
	}
	for _, u := range targets {
		if !seen[u] {
			allURLs = append(allURLs, u)
			seen[u] = true
		}
	}
	if len(allURLs) == 0 {
		log.Println("no URLs to process")
		return
	}
	log.Printf("processing %d galleries", len(allURLs))

	// Build the proxy address ("host:port") and the HTTP client.
	proxy := ip + ":" + port
	client := createHTTPClient(proxy)

	// A weighted semaphore bounds the number of concurrent crawls.
	sem := semaphore.NewWeighted(int64(Concurrency))
	stats := &CrawlStats{Total: len(allURLs)}

	// Launch one goroutine per gallery and wait for all of them.
	var wg sync.WaitGroup
	results := make(chan GalleryResult, len(allURLs))
	for _, galleryURL := range allURLs {
		wg.Add(1)
		go func(u string) {
			defer wg.Done()
			results <- crawlSingleGallery(client, sem, u, stats)
		}(galleryURL)
	}
	wg.Wait()
	close(results)

	// Collect the failures.
	var newFailed []string
	for result := range results {
		if !result.OK {
			newFailed = append(newFailed, result.URL)
			log.Printf("gallery failed %s: %v", result.URL, result.Error)
		}
	}

	// Persist the failure record, or clear it when everything succeeded.
	if len(newFailed) > 0 {
		if err := saveFailedUrl(newFailed); err != nil {
			log.Printf("failed to save failure record: %v", err)
		} else {
			log.Printf("%d galleries still failing this round; written to %s", len(newFailed), FailedRecordUrl)
		}
	} else {
		if err := os.Remove(FailedRecordUrl); err != nil && !os.IsNotExist(err) {
			log.Printf("failed to remove failure record: %v", err)
		} else {
			log.Println("all galleries crawled!")
		}
	}

	// Final statistics.
	log.Printf("stats: total=%d, success=%d, failed=%d, skipped=%d",
		stats.Total, stats.Success, stats.Failed, stats.Skipped)
}
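
// -------------------- Configuration (reconstructed) --------------------

// The constant block below is a minimal sketch: only the names are attested
// by the code above. The values are placeholder assumptions, and TargetsFile
// is a hypothetical name (the recovered code never shows where loadTargets
// reads from). Adjust these before use.
const (
	DownloadsDir    = "downloads"       // root folder for gallery output (assumed)
	FailedRecordUrl = "failed_urls.txt" // failure record file (assumed)
	TargetsFile     = "targets.txt"     // hypothetical target-list file name
	MaxPage         = 50                // per-gallery page cap (assumed)
	Concurrency     = 4                 // max concurrent gallery crawls (assumed)
	RetryPerPage    = 3                 // fetch attempts per page (assumed)
)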
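
// -------------------- File helpers (reconstructed) --------------------

// Minimal sketches of the list-file helpers referenced above. The signatures
// are inferred from the call sites; the one-URL-per-line file format and the
// readURLList helper are assumptions.

// ensureDownloadsDir creates DownloadsDir if it does not already exist.
func ensureDownloadsDir() error {
	return os.MkdirAll(DownloadsDir, 0755)
}

// readURLList reads one URL per line, skipping blank lines.
func readURLList(path string) ([]string, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	var urls []string
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		if line := strings.TrimSpace(scanner.Text()); line != "" {
			urls = append(urls, line)
		}
	}
	return urls, scanner.Err()
}

// loadTargets reads the gallery URLs to crawl from TargetsFile.
func loadTargets() ([]string, error) {
	return readURLList(TargetsFile)
}

// loadFailedUrl reads the failure record; a missing file is not an error.
func loadFailedUrl() ([]string, error) {
	urls, err := readURLList(FailedRecordUrl)
	if os.IsNotExist(err) {
		return nil, nil
	}
	return urls, err
}

// saveFailedUrl overwrites the failure record with the given URLs.
func saveFailedUrl(urls []string) error {
	return os.WriteFile(FailedRecordUrl, []byte(strings.Join(urls, "\n")+"\n"), 0644)
}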
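
// -------------------- Network and HTML helpers (reconstructed) --------------------

// Sketches of the remaining lost helpers. createHTTPClient assumes an HTTP
// proxy listening at "host:port"; extractTitle assumes the gallery title is
// the page's <title> element, which is consistent with the
// golang.org/x/net/html import in the original.

// createHTTPClient returns a client that routes requests through the proxy.
func createHTTPClient(proxy string) *http.Client {
	transport := &http.Transport{}
	if proxyURL, err := url.Parse("http://" + proxy); err == nil {
		transport.Proxy = http.ProxyURL(proxyURL)
	}
	return &http.Client{
		Transport: transport,
		Timeout:   30 * time.Second, // assumed request timeout
	}
}

// extractTitle parses the page and returns the text of its <title> element.
func extractTitle(htmlContent string) string {
	doc, err := html.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return "untitled"
	}
	var title string
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if title != "" {
			return
		}
		if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
			title = strings.TrimSpace(n.FirstChild.Data)
			return
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	if title == "" {
		return "untitled"
	}
	return title
}

// cleanFolderName replaces characters that are illegal in folder names.
func cleanFolderName(name string) string {
	cleaned := strings.TrimSpace(illegalChars.ReplaceAllString(name, "_"))
	if cleaned == "" {
		return "untitled"
	}
	return cleaned
}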