
Goroutine not running when called through recursion

go
  •  -2
  • dopatraman  · asked 7 years ago

    I am working on the Web Crawler exercise from A Tour of Go. My solution so far:

    func GatherUrls(url string, fetcher Fetcher) []string {
        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println("error:", err)
        } else {
            fmt.Printf("found: %s %q\n", url, body)
        }
        return urls
    }
    
    // Crawl uses fetcher to recursively crawl
    // pages starting with url, to a maximum of depth.
    func Crawl(url string, depth int, fetcher Fetcher) {
        // get all urls for depth
        // check if url has been crawled
        //  Y: noop
        //  N: crawl url
        // when depth is 0, stop
        fmt.Printf("crawling %q...\n", url)
        if depth <= 0 {
            return
        }
        urls := GatherUrls(url, fetcher)
        fmt.Println("urls:", urls)
        for _, u := range urls {
            fmt.Println("currentUrl:", u)
            if _, exists := cache[u]; !exists {
                fmt.Printf("about to crawl %q\n", u)
                go Crawl(u, depth - 1, fetcher)
            } else {
                cache[u] = true
            }
        }
    }
    
    func main() {
        cache = make(map[string]bool)
        Crawl("https://golang.org/", 4, fetcher)
    }
    

    When I run this code, the Crawl() function is never called on the recursion (I know this because fmt.Printf("crawling %q...\n", url) is only called once).

    Here is the log output:

    crawling "https://golang.org/"...
    found: https://golang.org/ "The Go Programming Language"
    urls: [https://golang.org/pkg/ https://golang.org/cmd/]
    currentUrl: https://golang.org/pkg/
    about to crawl "https://golang.org/pkg/"
    currentUrl: https://golang.org/cmd/
    about to crawl "https://golang.org/cmd/"
    

    What am I doing wrong? I suspect spawning a goroutine for the recursion is the wrong approach. Please advise.

    Please note that I want to do this with as few libraries as possible. I have seen some answers using WaitGroup. I don't want to use it.

    Note: the full code, including the lesson boilerplate, is below:

    package main
    
    import (
        "fmt"
    )
    
    var cache map[string]bool
    
    type Fetcher interface {
        // Fetch returns the body of URL and
        // a slice of URLs found on that page.
        Fetch(url string) (body string, urls []string, err error)
    }
    
    func GatherUrls(url string, fetcher Fetcher) []string {
        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println("error:", err)
        } else {
            fmt.Printf("found: %s %q\n", url, body)
        }
        return urls
    }
    
    // Crawl uses fetcher to recursively crawl
    // pages starting with url, to a maximum of depth.
    func Crawl(url string, depth int, fetcher Fetcher) {
        // get all urls for depth
        // check if url has been crawled
        //  Y: noop
        //  N: crawl url
        // when depth is 0, stop
        fmt.Printf("crawling %q...\n", url)
        if depth <= 0 {
            return
        }
        urls := GatherUrls(url, fetcher)
        fmt.Println("urls:", urls)
        for _, u := range urls {
            fmt.Println("currentUrl:", u)
            if _, exists := cache[u]; !exists {
                fmt.Printf("about to crawl %q\n", u)
                go Crawl(u, depth - 1, fetcher)
            } else {
                cache[u] = true
            }
        }
    }
    
    func main() {
        cache = make(map[string]bool)
        Crawl("https://golang.org/", 4, fetcher)
    }
    
    // fakeFetcher is Fetcher that returns canned results.
    type fakeFetcher map[string]*fakeResult
    
    type fakeResult struct {
        body string
        urls []string
    }
    
    func (f fakeFetcher) Fetch(url string) (string, []string, error) {
        if res, ok := f[url]; ok {
            return res.body, res.urls, nil
        }
        return "", nil, fmt.Errorf("not found: %s", url)
    }
    
    // fetcher is a populated fakeFetcher.
    var fetcher = fakeFetcher{
        "https://golang.org/": &fakeResult{
            "The Go Programming Language",
            []string{
                "https://golang.org/pkg/",
                "https://golang.org/cmd/",
            },
        },
        "https://golang.org/pkg/": &fakeResult{
            "Packages",
            []string{
                "https://golang.org/",
                "https://golang.org/cmd/",
                "https://golang.org/pkg/fmt/",
                "https://golang.org/pkg/os/",
            },
        },
        "https://golang.org/pkg/fmt/": &fakeResult{
            "Package fmt",
            []string{
                "https://golang.org/",
                "https://golang.org/pkg/",
            },
        },
        "https://golang.org/pkg/os/": &fakeResult{
            "Package os",
            []string{
                "https://golang.org/",
                "https://golang.org/pkg/",
            },
        },
    }
    
    3 Answers  |  7 years ago
        1
  •  0
  •   ThanhHH    7 years ago

    As you can see from this exercise ( https://tour.golang.org/concurrency/10 ), we should do the following:

    • Fetch URLs in parallel.
    • Don't fetch the same URL twice.
    • Cache URLs that have already been fetched on a map, but maps alone are not safe for concurrent use!

    So we can take the following steps to meet the requirements above:

    Create a struct to store the fetch result:

    type Result struct {
        body string
        urls []string
        err  error
    }
    

    Create a struct to store the URLs that have already been fetched on a map; for safe concurrent access we need sync.Mutex:

    type Cache struct {
        store map[string]bool
        mux   sync.Mutex
    }
    

    Fetch URLs and page bodies in parallel, adding each URL to the cache as it is fetched; since the cache is read and written from multiple goroutines, it must be accessed under the mutex. So we can modify the Crawl function as follows:

    func Crawl(url string, depth int, fetcher Fetcher) {
        if depth <= 0 {
            return
        }
    
        ch := make(chan Result)
    
        go func(url string, res chan Result) {
            body, urls, err := fetcher.Fetch(url)
    
            if err != nil {
                ch <- Result{body, urls, err}
                return
            }
    
            var furls []string
            cache.mux.Lock()
            for _, u := range urls {
                if _, exists := cache.store[u]; !exists {
                    furls = append(furls, u)
                }
                cache.store[u] = true
            }
            cache.mux.Unlock()
    
            ch <- Result{body: body, urls: furls, err: err}
    
        }(url, ch)
    
        res := <-ch
    
        if res.err != nil {
            fmt.Println(res.err)
            return
        }
    
        fmt.Printf("found: %s %q\n", url, res.body)
    
        for _, u := range res.urls {
            Crawl(u, depth-1, fetcher)
        }
    }
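
    Note that this snippet assumes a package-level cache of the Cache type defined above; a minimal sketch of the assumed declaration (the full code is in the playground link below):

    // Assumed package-level declaration of the shared, mutex-guarded cache.
    var cache = Cache{store: make(map[string]bool)}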
    

    You can review the full code and run it on the playground: https://play.golang.org/p/iY9uBXchx3w

    Hope this helps.

        2
  •  0
  •   Cerise Limón    7 years ago

    The main() function exits before the goroutines complete. Wait for them by using a wait group:

    There is a data race on cache . Protect it with a mutex. Always set cache[u] = true for URLs that are about to be visited.

    var wg sync.WaitGroup
    var mu sync.Mutex
    var fetched = map[string]bool{}
    
    func Crawl(url string, depth int, fetcher Fetcher) {
        if depth <= 0 {
            return
        }
        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println(err)
            return
        }
        fmt.Printf("found: %s %q\n", url, body)
        for _, u := range urls {
            mu.Lock()
            f := fetched[u]
            fetched[u] = true
            mu.Unlock()
            if !f {
                wg.Add(1)
                go func(u string) {
                    defer wg.Done()
                    Crawl(u, depth-1, fetcher)
                }(u)
            }
        }
        return
    }
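
    For this to work, main must wait on the group after the initial call; since the top-level Crawl runs synchronously, every wg.Add happens before the wait. A minimal sketch:

    func main() {
        Crawl("https://golang.org/", 4, fetcher)
        wg.Wait() // block until every spawned goroutine calls Done
    }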
    

    playground example

    Wait groups are the idiomatic way to wait for goroutines to complete. If you cannot use sync.WaitGroup for some reason, then reimplement the type using a counter, a mutex, and a channel:

    // WaitGroup reimplements sync.WaitGroup semantics with a mutex-guarded
    // counter and a channel that is closed when the counter reaches zero.
    type WaitGroup struct {
        mu   sync.Mutex
        n    int
        done chan struct{}
    }
    
    // Add adds i (which may be negative) to the counter, closing the done
    // channel to release waiters when the counter reaches zero.
    func (wg *WaitGroup) Add(i int) {
        wg.mu.Lock()
        defer wg.mu.Unlock()
        if wg.done == nil {
            wg.done = make(chan struct{})
        }
        wg.n += i
        if wg.n < 0 {
            panic("negative count")
        }
        if wg.n == 0 {
            close(wg.done)
            wg.done = nil
        }
    }
    
    func (wg *WaitGroup) Done() {
        wg.Add(-1)
    }
    
    // Wait blocks until the counter reaches zero.
    func (wg *WaitGroup) Wait() {
        wg.mu.Lock()
        done := wg.done
        wg.mu.Unlock()
        if done != nil {
            <-done
        }
    }
    

    playground example
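
    With this type in place, the Crawl code above should only need its wait-group declaration swapped; a minimal sketch, assuming the rest of the program is unchanged:

    // Hand-rolled replacement for the sync.WaitGroup declaration above.
    var wg WaitGroup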

        3
  •  -1
  •   big pigeon    7 years ago

    Because the main function exits.

    You need to add a sync.WaitGroup to keep the main function waiting until all the goroutines have finished:

    package main
    
    import (
        "fmt"
        "sync"
    )
    
    var cache map[string]bool
    
    var wg sync.WaitGroup
    
    type Fetcher interface {
        // Fetch returns the body of URL and
        // a slice of URLs found on that page.
        Fetch(url string) (body string, urls []string, err error)
    }
    
    func GatherUrls(url string, fetcher Fetcher, Urls chan []string) {
        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println("error:", err)
        } else {
            fmt.Printf("found: %s %q\n", url, body)
        }
        Urls <- urls
        wg.Done()
    }
    
    // Crawl uses fetcher to recursively crawl
    // pages starting with url, to a maximum of depth.
    func Crawl(url string, depth int, fetcher Fetcher) {
        // Defer Done so that the early depth <= 0 return below also
        // decrements the counter; otherwise wg.Wait() in main blocks forever.
        defer wg.Done()
        // get all urls for depth
        // check if url has been crawled
        //  Y: noop
        //  N: crawl url
        // when depth is 0, stop
        fmt.Printf("crawling %q... %d\n", url, depth)
        if depth <= 0 {
            return
        }
        uc := make(chan []string)
        wg.Add(1)
        go GatherUrls(url, fetcher, uc)
        urls := <-uc
        fmt.Println("urls:", urls)
        for _, u := range urls {
            fmt.Println("currentUrl:", u)
            if _, exists := cache[u]; !exists {
                fmt.Printf("about to crawl %q\n", u)
                wg.Add(1)
                go Crawl(u, depth-1, fetcher)
            } else {
                cache[u] = true
            }
        }
    }
    
    func main() {
        cache = make(map[string]bool)
        wg.Add(1)
        go Crawl("https://golang.org/", 4, fetcher)
        wg.Wait()
    }
    
    // fakeFetcher is Fetcher that returns canned results.
    type fakeFetcher map[string]*fakeResult
    
    type fakeResult struct {
        body string
        urls []string
    }
    
    func (f fakeFetcher) Fetch(url string) (string, []string, error) {
        if res, ok := f[url]; ok {
            return res.body, res.urls, nil
        }
        return "", nil, fmt.Errorf("not found: %s", url)
    }
    
    // fetcher is a populated fakeFetcher.
    var fetcher = fakeFetcher{
        "https://golang.org/": &fakeResult{
            "The Go Programming Language",
            []string{
                "https://golang.org/pkg/",
                "https://golang.org/cmd/",
            },
        },
        "https://golang.org/pkg/": &fakeResult{
            "Packages",
            []string{
                "https://golang.org/",
                "https://golang.org/cmd/",
                "https://golang.org/pkg/fmt/",
                "https://golang.org/pkg/os/",
            },
        },
        "https://golang.org/pkg/fmt/": &fakeResult{
            "Package fmt",
            []string{
                "https://golang.org/",
                "https://golang.org/pkg/",
            },
        },
        "https://golang.org/pkg/os/": &fakeResult{
            "Package os",
            []string{
                "https://golang.org/",
                "https://golang.org/pkg/",
            },
        },
    }