
Goroutine not running when called through recursion

go
  •  -2
  • dopatraman  · asked 7 years ago

    I am working on the Web Crawler exercise from A Tour of Go. My solution so far:

    func GatherUrls(url string, fetcher Fetcher) []string {
        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println("error:", err)
        } else {
            fmt.Printf("found: %s %q\n", url, body)
        }
        return urls
    }
    
    // Crawl uses fetcher to recursively crawl
    // pages starting with url, to a maximum of depth.
    func Crawl(url string, depth int, fetcher Fetcher) {
        // get all urls for depth
        // check if url has been crawled
        //  Y: noop
        //  N: crawl url
        // when depth is 0, stop
        fmt.Printf("crawling %q...\n", url)
        if depth <= 0 {
            return
        }
        urls := GatherUrls(url, fetcher)
        fmt.Println("urls:", urls)
        for _, u := range urls {
            fmt.Println("currentUrl:", u)
            if _, exists := cache[u]; !exists {
                fmt.Printf("about to crawl %q\n", u)
                go Crawl(u, depth - 1, fetcher)
            } else {
                cache[u] = true
            }
        }
    }
    
    func main() {
        cache = make(map[string]bool)
        Crawl("https://golang.org/", 4, fetcher)
    }
    

    When I run this code, the Crawl() function is never called on the recursion (I know this because fmt.Printf("crawling %q...\n", url) is only called once).

    Here is the log output:

    crawling "https://golang.org/"...
    found: https://golang.org/ "The Go Programming Language"
    urls: [https://golang.org/pkg/ https://golang.org/cmd/]
    currentUrl: https://golang.org/pkg/
    about to crawl "https://golang.org/pkg/"
    currentUrl: https://golang.org/cmd/
    about to crawl "https://golang.org/cmd/"
    

    What am I doing wrong? I suspect spawning a goroutine for the recursion is the wrong approach. Please advise.

    Please note that I want to do this with as few libraries as possible. I have seen some answers using WaitGroup. I don't want to use it.

    Note: the full code, including the lesson boilerplate, is below:

    package main
    
    import (
        "fmt"
    )
    
    var cache map[string]bool
    
    type Fetcher interface {
        // Fetch returns the body of URL and
        // a slice of URLs found on that page.
        Fetch(url string) (body string, urls []string, err error)
    }
    
    func GatherUrls(url string, fetcher Fetcher) []string {
        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println("error:", err)
        } else {
            fmt.Printf("found: %s %q\n", url, body)
        }
        return urls
    }
    
    // Crawl uses fetcher to recursively crawl
    // pages starting with url, to a maximum of depth.
    func Crawl(url string, depth int, fetcher Fetcher) {
        // get all urls for depth
        // check if url has been crawled
        //  Y: noop
        //  N: crawl url
        // when depth is 0, stop
        fmt.Printf("crawling %q...\n", url)
        if depth <= 0 {
            return
        }
        urls := GatherUrls(url, fetcher)
        fmt.Println("urls:", urls)
        for _, u := range urls {
            fmt.Println("currentUrl:", u)
            if _, exists := cache[u]; !exists {
                fmt.Printf("about to crawl %q\n", u)
                go Crawl(u, depth - 1, fetcher)
            } else {
                cache[u] = true
            }
        }
    }
    
    func main() {
        cache = make(map[string]bool)
        Crawl("https://golang.org/", 4, fetcher)
    }
    
    // fakeFetcher is Fetcher that returns canned results.
    type fakeFetcher map[string]*fakeResult
    
    type fakeResult struct {
        body string
        urls []string
    }
    
    func (f fakeFetcher) Fetch(url string) (string, []string, error) {
        if res, ok := f[url]; ok {
            return res.body, res.urls, nil
        }
        return "", nil, fmt.Errorf("not found: %s", url)
    }
    
    // fetcher is a populated fakeFetcher.
    var fetcher = fakeFetcher{
        "https://golang.org/": &fakeResult{
            "The Go Programming Language",
            []string{
                "https://golang.org/pkg/",
                "https://golang.org/cmd/",
            },
        },
        "https://golang.org/pkg/": &fakeResult{
            "Packages",
            []string{
                "https://golang.org/",
                "https://golang.org/cmd/",
                "https://golang.org/pkg/fmt/",
                "https://golang.org/pkg/os/",
            },
        },
        "https://golang.org/pkg/fmt/": &fakeResult{
            "Package fmt",
            []string{
                "https://golang.org/",
                "https://golang.org/pkg/",
            },
        },
        "https://golang.org/pkg/os/": &fakeResult{
            "Package os",
            []string{
                "https://golang.org/",
                "https://golang.org/pkg/",
            },
        },
    }
    
    3 Answers  |  7 years ago
        1
  •  0
  •   ThanhHH    7 years ago

    As you can see from this exercise ( https://tour.golang.org/concurrency/10 ), we should do the following:

    • Fetch URLs in parallel.
    • Don't fetch the same URL twice.
    • Cache URLs that have already been fetched on a map, but maps alone are not safe for concurrent use!

    So we can take the following steps to meet the requirements above:

    Create a struct to store the fetch result:

    type Result struct {
        body string
        urls []string
        err  error
    }
    

    Create a struct to store the URLs that have already been fetched on a map; for safe concurrent access we need sync.Mutex:

    type Cache struct {
        store map[string]bool
        mux   sync.Mutex
    }
    

    Fetch URLs and page bodies in parallel, adding each URL to the cache as it is fetched; since the cache is read and written from multiple goroutines, it must be accessed under the mutex. So we can modify the Crawl function as follows:

    func Crawl(url string, depth int, fetcher Fetcher) {
        if depth <= 0 {
            return
        }
    
        ch := make(chan Result)
    
        go func(url string, res chan Result) {
            body, urls, err := fetcher.Fetch(url)
    
            if err != nil {
                ch <- Result{body, urls, err}
                return
            }
    
            var furls []string
            cache.mux.Lock()
            for _, u := range urls {
                if _, exists := cache.store[u]; !exists {
                    furls = append(furls, u)
                }
                cache.store[u] = true
            }
            cache.mux.Unlock()
    
            ch <- Result{body: body, urls: furls, err: err}
    
        }(url, ch)
    
        res := <-ch
    
        if res.err != nil {
            fmt.Println(res.err)
            return
        }
    
        fmt.Printf("found: %s %q\n", url, res.body)
    
        for _, u := range res.urls {
            Crawl(u, depth-1, fetcher)
        }
    }
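
    Note that this snippet assumes a package-level cache of the Cache type defined above; a minimal sketch of the assumed declaration (the full code is in the playground link below):

    // Assumed package-level declaration of the shared, mutex-guarded cache.
    var cache = Cache{store: make(map[string]bool)}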
    

    You can review the full code and run it on the playground: https://play.golang.org/p/iY9uBXchx3w

    Hope this helps.

        2
  •  0
  •   Cerise Limón    7 years ago

    The main() function exits before the goroutines complete. Wait for them by using a wait group:

    There is a data race on cache . Protect it with a mutex. Always set cache[u] = true for URLs that are about to be visited.

    var wg sync.WaitGroup
    var mu sync.Mutex
    var fetched = map[string]bool{}
    
    func Crawl(url string, depth int, fetcher Fetcher) {
        if depth <= 0 {
            return
        }
        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println(err)
            return
        }
        fmt.Printf("found: %s %q\n", url, body)
        for _, u := range urls {
            mu.Lock()
            f := fetched[u]
            fetched[u] = true
            mu.Unlock()
            if !f {
                wg.Add(1)
                go func(u string) {
                    defer wg.Done()
                    Crawl(u, depth-1, fetcher)
                }(u)
            }
        }
        return
    }
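
    For this to work, main must wait on the group after the initial call; since the top-level Crawl runs synchronously, every wg.Add happens before the wait. A minimal sketch:

    func main() {
        Crawl("https://golang.org/", 4, fetcher)
        wg.Wait() // block until every spawned goroutine calls Done
    }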
    

    playground example

    Wait groups are the idiomatic way to wait for goroutines to complete. If you cannot use sync.WaitGroup for some reason, then reimplement the type using a counter, a mutex, and a channel:

    // WaitGroup reimplements sync.WaitGroup semantics with a mutex-guarded
    // counter and a channel that is closed when the counter reaches zero.
    type WaitGroup struct {
        mu   sync.Mutex
        n    int
        done chan struct{}
    }
    
    // Add adds i (which may be negative) to the counter, closing the done
    // channel to release waiters when the counter reaches zero.
    func (wg *WaitGroup) Add(i int) {
        wg.mu.Lock()
        defer wg.mu.Unlock()
        if wg.done == nil {
            wg.done = make(chan struct{})
        }
        wg.n += i
        if wg.n < 0 {
            panic("negative count")
        }
        if wg.n == 0 {
            close(wg.done)
            wg.done = nil
        }
    }
    
    func (wg *WaitGroup) Done() {
        wg.Add(-1)
    }
    
    // Wait blocks until the counter reaches zero.
    func (wg *WaitGroup) Wait() {
        wg.mu.Lock()
        done := wg.done
        wg.mu.Unlock()
        if done != nil {
            <-done
        }
    }
    

    playground example
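
    With this type in place, the Crawl code above should only need its wait-group declaration swapped; a minimal sketch, assuming the rest of the program is unchanged:

    // Hand-rolled replacement for the sync.WaitGroup declaration above.
    var wg WaitGroup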

        3
  •  -1
  •   big pigeon    7 years ago

    Because the main function exits.

    You need to add a sync.WaitGroup to keep the main function waiting until all the goroutines have finished:

    package main
    
    import (
        "fmt"
        "sync"
    )
    
    var cache map[string]bool
    
    var wg sync.WaitGroup
    
    type Fetcher interface {
        // Fetch returns the body of URL and
        // a slice of URLs found on that page.
        Fetch(url string) (body string, urls []string, err error)
    }
    
    func GatherUrls(url string, fetcher Fetcher, Urls chan []string) {
        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println("error:", err)
        } else {
            fmt.Printf("found: %s %q\n", url, body)
        }
        Urls <- urls
        wg.Done()
    }
    
    // Crawl uses fetcher to recursively crawl
    // pages starting with url, to a maximum of depth.
    func Crawl(url string, depth int, fetcher Fetcher) {
        // Defer Done so that the early depth <= 0 return below also
        // decrements the counter; otherwise wg.Wait() in main blocks forever.
        defer wg.Done()
        // get all urls for depth
        // check if url has been crawled
        //  Y: noop
        //  N: crawl url
        // when depth is 0, stop
        fmt.Printf("crawling %q... %d\n", url, depth)
        if depth <= 0 {
            return
        }
        uc := make(chan []string)
        wg.Add(1)
        go GatherUrls(url, fetcher, uc)
        urls := <-uc
        fmt.Println("urls:", urls)
        for _, u := range urls {
            fmt.Println("currentUrl:", u)
            if _, exists := cache[u]; !exists {
                fmt.Printf("about to crawl %q\n", u)
                wg.Add(1)
                go Crawl(u, depth-1, fetcher)
            } else {
                cache[u] = true
            }
        }
    }
    
    func main() {
        cache = make(map[string]bool)
        wg.Add(1)
        go Crawl("https://golang.org/", 4, fetcher)
        wg.Wait()
    }
    
    // fakeFetcher is Fetcher that returns canned results.
    type fakeFetcher map[string]*fakeResult
    
    type fakeResult struct {
        body string
        urls []string
    }
    
    func (f fakeFetcher) Fetch(url string) (string, []string, error) {
        if res, ok := f[url]; ok {
            return res.body, res.urls, nil
        }
        return "", nil, fmt.Errorf("not found: %s", url)
    }
    
    // fetcher is a populated fakeFetcher.
    var fetcher = fakeFetcher{
        "https://golang.org/": &fakeResult{
            "The Go Programming Language",
            []string{
                "https://golang.org/pkg/",
                "https://golang.org/cmd/",
            },
        },
        "https://golang.org/pkg/": &fakeResult{
            "Packages",
            []string{
                "https://golang.org/",
                "https://golang.org/cmd/",
                "https://golang.org/pkg/fmt/",
                "https://golang.org/pkg/os/",
            },
        },
        "https://golang.org/pkg/fmt/": &fakeResult{
            "Package fmt",
            []string{
                "https://golang.org/",
                "https://golang.org/pkg/",
            },
        },
        "https://golang.org/pkg/os/": &fakeResult{
            "Package os",
            []string{
                "https://golang.org/",
                "https://golang.org/pkg/",
            },
        },
    }