I've made a program that scrapes all the pages of a website using goroutines:
func main() {
    start := time.Now()

    knownUrls := getKnownURLs(os.Getenv("SITEMAP_URL"))

    var wg sync.WaitGroup
    for index, url := range knownUrls {
        wg.Add(1)
        fmt.Printf("%d/%d\n", index+1, len(knownUrls))
        // One goroutine per URL, with no upper bound on concurrency.
        go func() {
            if err := indexArticleFromURL(url, client); err != nil {
                log.Fatalf("Error indexing doc: %s", err)
            }
            wg.Done()
        }()
    }
    wg.Wait()

    elapsed := time.Since(start)
    fmt.Printf("Took %s", elapsed)
}
This works shockingly fast: 5.9s for a thousand pages, to be exact. But it bothers me that for a site with thousands of pages it will spawn thousands of goroutines.
So I refactored it using the golang.org/x/sync/semaphore package. From what I understand, the semaphore should cap the number of concurrent goroutines at what the processor can handle, and that shouldn't hurt performance, since the first program already physically could not use more threads than the CPU provides.
func main() {
    start := time.Now()
    ctx := context.Background()

    knownUrls := getKnownURLs(os.Getenv("SITEMAP_URL"))

    var (
        maxWorkers = runtime.GOMAXPROCS(0)
        sem        = semaphore.NewWeighted(int64(maxWorkers))
    )

    for index, url := range knownUrls {
        // Block until a slot is free, so at most maxWorkers goroutines run at once.
        if err := sem.Acquire(ctx, 1); err != nil {
            log.Printf("Failed to acquire semaphore: %v", err)
            break
        }
        fmt.Printf("%d/%d\n", index+1, len(knownUrls))
        go func() {
            if err := indexDocFromURL(url, client); err != nil {
                log.Fatalf("Error indexing doc: %s", err)
            }
            sem.Release(1)
        }()
    }

    // Acquiring all the slots waits for the remaining goroutines to finish.
    if err := sem.Acquire(ctx, int64(maxWorkers)); err != nil {
        log.Printf("Failed to acquire semaphore: %v", err)
    }

    elapsed := time.Since(start)
    fmt.Printf("Took %s", elapsed)
}
But now the program takes significantly longer: 11+ seconds.
That seems wrong, since runtime.GOMAXPROCS(0) returns the maximum number of CPUs that can be executing simultaneously.
Why is the semaphore version slower? And how do I make it match the performance of the unbounded first version, while making sure the number of goroutines won't crash the program?
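In case a plain worker pool is the more idiomatic fix, this is the alternative I'm considering (just a sketch that reuses the knownUrls, client, and indexDocFromURL pieces from the snippets above, with maxWorkers standing in for whatever limit turns out to be right):
func runWithWorkerPool(knownUrls []string, maxWorkers int) {
    urls := make(chan string)
    var wg sync.WaitGroup

    // Fixed pool: each worker pulls URLs off the channel until it is closed.
    for i := 0; i < maxWorkers; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for url := range urls {
                // Log instead of log.Fatalf so one bad page doesn't kill the whole run.
                if err := indexDocFromURL(url, client); err != nil {
                    log.Printf("Error indexing doc: %s", err)
                }
            }
        }()
    }

    // Feed the pool, then wait for it to drain.
    for _, url := range knownUrls {
        urls <- url
    }
    close(urls)
    wg.Wait()
}
Would a pool like this behave any differently from the semaphore, or is the real issue how I'm sizing maxWorkers?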