| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421 |
- package main
- import (
- "bufio"
- "context"
- "encoding/json"
- "fmt"
- "io"
- "log"
- "net/http"
- "net/url"
- "os"
- "path/filepath"
- "regexp"
- "strings"
- "sync"
- "time"
- "fyne.io/fyne/v2/widget"
- "golang.org/x/net/html"
- "golang.org/x/sync/semaphore"
- )
// -------------------- Globals --------------------
var (
	// illegalChars matches characters that may not appear in folder names:
	// Windows-reserved punctuation plus ASCII control characters (0x00-0x1F).
	// Used by cleanFolderName when deriving a directory from a page title.
	illegalChars = regexp.MustCompile(`[<>:"/\\|?*\x00-\x1F]`)
	// imgLinkRegex captures the href attribute of <a> tags; used by
	// extractImageLinks to pull per-image entry links out of a gallery page.
	imgLinkRegex = regexp.MustCompile(`<a href="(.*?)"`)
)
// -------------------- Data structures --------------------

// GalleryResult reports the outcome of crawling a single gallery URL.
type GalleryResult struct {
	URL   string // the gallery URL that was processed
	OK    bool   // true when the gallery was saved, or skipped as already present
	Error error  // reason for failure; nil when OK is true
}
// CrawlStats aggregates counters across concurrent gallery crawls.
// Counter updates are performed while holding mu; Total is written once,
// before any worker goroutine starts.
type CrawlStats struct {
	Total   int // number of galleries queued for processing
	Success int // galleries whose link JSON was written successfully
	Failed  int // galleries that did not complete successfully
	Skipped int // galleries whose link JSON already existed on disk
	mu      sync.Mutex // guards the counters above
}
// -------------------- Utilities --------------------

// cleanFolderName turns a gallery title into a safe directory name by
// dropping characters that are illegal in file names (<>:"/\|?* and ASCII
// control characters) together with spaces and underscores. When nothing
// printable remains, the generic name "gallery" is returned.
func cleanFolderName(title string) string {
	clean := strings.Map(func(r rune) rune {
		// Drop control chars, spaces, underscores and reserved punctuation;
		// returning -1 removes the rune from the output.
		if r < 0x20 || r == ' ' || r == '_' || strings.ContainsRune(`<>:"/\|?*`, r) {
			return -1
		}
		return r
	}, title)
	clean = strings.TrimSpace(clean)
	if clean == "" {
		return "gallery"
	}
	return clean
}
- func loadTargets() ([]string, error) {
- file, err := os.Open(TargetsFile)
- if err != nil {
- if os.IsNotExist(err) {
- // 创建空文件
- file, err := os.Create(TargetsFile)
- if err != nil {
- return nil, fmt.Errorf("创建目标文件失败: %v", err)
- }
- file.Close()
- return nil, fmt.Errorf("目标文件不存在,已自动创建,请先填写URL")
- }
- return nil, err
- }
- defer file.Close()
- var targets []string
- seen := make(map[string]bool)
- scanner := bufio.NewScanner(file)
- for scanner.Scan() {
- line := strings.TrimSpace(scanner.Text())
- if line != "" && !seen[line] {
- targets = append(targets, line)
- seen[line] = true
- }
- }
- if err := scanner.Err(); err != nil {
- return nil, err
- }
- if len(targets) == 0 {
- return nil, fmt.Errorf("目标文件为空,请先填写URL")
- }
- return targets, nil
- }
- func loadFailedUrl() ([]string, error) {
- data, err := os.ReadFile(FailedRecordUrl)
- if err != nil {
- if os.IsNotExist(err) {
- return []string{}, nil
- }
- return nil, err
- }
- var failed []string
- err = json.Unmarshal(data, &failed)
- if err != nil {
- return nil, err
- }
- return failed, nil
- }
- func saveFailedUrl(keys []string) error {
- data, err := json.MarshalIndent(keys, "", " ")
- if err != nil {
- return err
- }
- return os.WriteFile(FailedRecordUrl, data, 0644)
- }
- func ensureDownloadsDir() error {
- return os.MkdirAll(DownloadsDir, 0755)
- }
- func fetchPage(client *http.Client, url string) (string, error) {
- var lastErr error
- for attempt := 1; attempt <= RetryPerPage; attempt++ {
- resp, err := client.Get(url)
- if err != nil {
- lastErr = err
- log.Printf("[%d/%d] 请求失败 %s -> %v", attempt, RetryPerPage, url, err)
- time.Sleep(time.Duration(1<<uint(attempt)) * time.Second)
- continue
- }
- if resp.StatusCode != http.StatusOK {
- resp.Body.Close()
- lastErr = fmt.Errorf("HTTP %d", resp.StatusCode)
- log.Printf("[%d/%d] 请求失败 %s -> %s", attempt, RetryPerPage, url, resp.Status)
- time.Sleep(time.Duration(1<<uint(attempt)) * time.Second)
- continue
- }
- body, err := io.ReadAll(resp.Body)
- resp.Body.Close()
- if err != nil {
- lastErr = err
- log.Printf("[%d/%d] 读取响应失败 %s -> %v", attempt, RetryPerPage, url, err)
- time.Sleep(time.Duration(1<<uint(attempt)) * time.Second)
- continue
- }
- return string(body), nil
- }
- return "", lastErr
- }
- // -------------------- HTML 解析 --------------------
- func extractTitle(htmlContent string) string {
- doc, err := html.Parse(strings.NewReader(htmlContent))
- if err != nil {
- return "gallery"
- }
- var title string
- var findTitle func(*html.Node)
- findTitle = func(n *html.Node) {
- if n.Type == html.ElementNode && n.Data == "title" {
- if n.FirstChild != nil {
- title = n.FirstChild.Data
- return
- }
- }
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- findTitle(c)
- }
- }
- findTitle(doc)
- if title == "" {
- return "gallery"
- }
- return title
- }
- func extractImageLinks(htmlContent string) []string {
- var links []string
- matches := imgLinkRegex.FindAllStringSubmatch(htmlContent, -1)
- for _, match := range matches {
- if len(match) > 1 {
- links = append(links, match[1])
- }
- }
- return links
- }
- // -------------------- 画廊爬取 --------------------
- func crawlSingleGallery(client *http.Client, sem *semaphore.Weighted, galleryURL string, stats *CrawlStats) GalleryResult {
- // 获取信号量
- ctx := context.Background()
- if err := sem.Acquire(ctx, 1); err != nil {
- return GalleryResult{URL: galleryURL, OK: false, Error: err}
- }
- defer sem.Release(1)
- // 解析基础URL和key
- baseURL := strings.TrimRight(galleryURL, "/")
- parsed, err := url.Parse(baseURL)
- if err != nil {
- return GalleryResult{URL: galleryURL, OK: false, Error: err}
- }
- pathParts := strings.Split(parsed.Path, "/")
- key := pathParts[len(pathParts)-1]
- jsonName := key + ".json"
- var folderPath string
- jsonData := make(map[string]string)
- imgCount := 1
- lastPage := false
- for page := 0; page < MaxPage && !lastPage; page++ {
- pageURL := baseURL
- if page > 0 {
- pageURL = fmt.Sprintf("%s?p=%d", baseURL, page)
- }
- htmlContent, err := fetchPage(client, pageURL)
- if err != nil {
- log.Printf("获取页面失败 %s: %v", pageURL, err)
- continue
- }
- // 提取标题和创建文件夹
- title := extractTitle(htmlContent)
- cleanTitle := cleanFolderName(title)
- folderPath = filepath.Join(DownloadsDir, cleanTitle)
- if err := os.MkdirAll(folderPath, 0755); err != nil {
- return GalleryResult{URL: galleryURL, OK: false, Error: err}
- }
- // 检查JSON文件是否已存在
- jsonPath := filepath.Join(folderPath, jsonName)
- if _, err := os.Stat(jsonPath); err == nil {
- stats.mu.Lock()
- stats.Skipped++
- stats.mu.Unlock()
- log.Printf("%s 已存在,跳过", jsonName)
- return GalleryResult{URL: galleryURL, OK: true}
- }
- log.Printf("当前页码:%d %s", page+1, pageURL)
- // 提取图片链接
- links := extractImageLinks(htmlContent)
- if len(links) == 0 {
- log.Printf("本页无图片入口,视为最后一页")
- lastPage = true
- continue
- }
- // 处理图片链接
- for _, link := range links {
- // 检查是否重复(简单的重复检测)
- isDuplicate := false
- for _, existingLink := range jsonData {
- if existingLink == link {
- isDuplicate = true
- lastPage = true
- break
- }
- }
- if isDuplicate {
- break
- }
- jsonData[fmt.Sprintf("%04d", imgCount)] = link
- imgCount++
- }
- }
- // 保存JSON文件
- if len(jsonData) > 0 {
- jsonPath := filepath.Join(folderPath, jsonName)
- data, err := json.MarshalIndent(jsonData, "", " ")
- if err != nil {
- return GalleryResult{URL: galleryURL, OK: false, Error: err}
- }
- if err := os.WriteFile(jsonPath, data, 0644); err != nil {
- return GalleryResult{URL: galleryURL, OK: false, Error: err}
- }
- log.Printf("保存成功 -> %s (%d 张)", jsonPath, len(jsonData))
- stats.mu.Lock()
- stats.Success++
- stats.mu.Unlock()
- return GalleryResult{URL: galleryURL, OK: true}
- } else {
- log.Printf("%s 未解析到任何图片链接", key)
- stats.mu.Lock()
- stats.Failed++
- stats.mu.Unlock()
- return GalleryResult{URL: galleryURL, OK: false, Error: fmt.Errorf("未解析到图片链接")}
- }
- }
- // -------------------- 主流程 --------------------
- func UrlDownloader(ip, port string, output *widget.Entry) {
- log.SetFlags(log.LstdFlags | log.Lshortfile)
- // 确保下载目录存在
- if err := ensureDownloadsDir(); err != nil {
- log.Fatalf("创建下载目录失败: %v", err)
- }
- // 加载目标URL
- targets, err := loadTargets()
- if err != nil {
- log.Fatal(err)
- }
- // 加载失败记录
- failed, err := loadFailedUrl()
- if err != nil {
- log.Printf("加载失败记录失败: %v", err)
- failed = []string{}
- }
- // 合并URL列表(去重)
- allURLs := make([]string, 0)
- seen := make(map[string]bool)
- // 优先添加失败记录
- if len(failed) > 0 {
- log.Printf("优先重试上次失败画廊: %d 个", len(failed))
- for _, url := range failed {
- if !seen[url] {
- allURLs = append(allURLs, url)
- seen[url] = true
- }
- }
- }
- // 添加新目标
- for _, url := range targets {
- if !seen[url] {
- allURLs = append(allURLs, url)
- seen[url] = true
- }
- }
- if len(allURLs) == 0 {
- log.Println("没有需要处理的URL")
- return
- }
- log.Printf("开始处理 %d 个画廊", len(allURLs))
- // 创建HTTP客户端
- proxy := ip + port
- client := createHTTPClient(proxy)
- // 创建信号量控制并发
- sem := semaphore.NewWeighted(int64(Concurrency))
- stats := &CrawlStats{Total: len(allURLs)}
- // 使用WaitGroup等待所有任务完成
- var wg sync.WaitGroup
- results := make(chan GalleryResult, len(allURLs))
- // 启动所有爬取任务
- for _, galleryURL := range allURLs {
- wg.Add(1)
- go func(url string) {
- defer wg.Done()
- result := crawlSingleGallery(client, sem, url, stats)
- results <- result
- }(galleryURL)
- }
- // 等待所有任务完成
- wg.Wait()
- close(results)
- // 收集失败结果
- var newFailed []string
- for result := range results {
- if !result.OK {
- newFailed = append(newFailed, result.URL)
- log.Printf("画廊处理失败 %s: %v", result.URL, result.Error)
- }
- }
- // 处理失败记录
- if len(newFailed) > 0 {
- if err := saveFailedUrl(newFailed); err != nil {
- log.Printf("保存失败记录失败: %v", err)
- } else {
- log.Printf("本轮仍有 %d 个画廊失败,已写入 %s", len(newFailed), FailedRecordUrl)
- }
- } else {
- // 删除失败记录文件
- if err := os.Remove(FailedRecordUrl); err != nil && !os.IsNotExist(err) {
- log.Printf("删除失败记录文件失败: %v", err)
- } else {
- log.Println("全部画廊抓取完成!")
- }
- }
- // 输出统计信息
- log.Printf("统计信息: 总计=%d, 成功=%d, 失败=%d, 跳过=%d",
- stats.Total, stats.Success, stats.Failed, stats.Skipped)
- }
|