// 批量抓取 funcBatchBasicCrawl() { // 从排行榜中获取code,并过滤已经爬取过的code basicFundList := dao.FilterBasicFund() total := len(basicFundList) if total > 0 { var baseRowsChannel = make(chan entity.FundBasis, total) // 分组抓取 crawlByGroup(basicFundList, baseRowsChannel) // 遍历channel获取数据 var fundBasisRows []entity.FundBasis for item := range baseRowsChannel { fundBasisRows = append(fundBasisRows, item) } if fundBasisRows != nil { // 保存入库 create := global.GvaMysqlClient.Create(fundBasisRows) if create.Error != nil { global.GvaLogger.Sugar().Errorf("基金详情入库失败", create.Error) return } global.GvaLogger.Sugar().Infof("基金详情抓取成功,共: %v 条", create.RowsAffected) } } }
4.2 过滤已有详情的 code(dao.FilterBasicFund)
// 查询没有详情的基金信息 funcFilterBasicFund() []FilterBasicResult { res := []FilterBasicResult{} global.GvaMysqlClient.Raw("SELECT A.fund_code,B.`code` from fas_fund_day_top as A LEFT JOIN fas_fund_basis as B on A.fund_code = B.`code` WHERE B.`code` is NULL GROUP BY A.fund_code").Scan(&res) return res }
4.3 分组抓取函数(crawlByGroup)
// 分组抓取,防止并发过大,被拒绝访问 funccrawlByGroup(basicResults []dao.FilterBasicResult, c chan<- entity.FundBasis) { // 分组抓取 groupNum := 15 fundCodeGroup := splitFundBasicList(basicResults, groupNum) // 并发请求抓取 var wg sync.WaitGroup wg.Add(groupNum) for _, results := range fundCodeGroup { basicFundList := results gofunc() { for _, item := range basicFundList { filterBasicResult := item f := BasisCrawl{} // 爬取页面信息 f.CrawlHtml(filterBasicResult.FundCode) if f.Code != "" { // 转成实体类型 toEntity := f.ConvertToEntity() c <- toEntity } } wg.Done() }() } wg.Wait() // 关闭通道 close(c) }