网站邮箱email地址定向采集核心代码分享
2016 年 7 月 4 日
邮箱采集demo: http://www.jsanai.com/emailco…
原理:
1、根据要采集的url地址,获取页面html内容,然后采用正则匹配出页面的url列表、邮箱地址列表。
2、获取到url列表及邮箱后分两个异步线程:
①保存邮箱地址;
②分析采集子页面url的邮箱地址;
核心源码(golang):
//采集网站地址入口方法 func CollectEmail(hosturl string) (EmailObj, []string, error) { emailObj := new(EmailObj) var inhost []string //获取主域名 uparse, err := url.Parse(hosturl) if err != nil { return *emailObj, inhost, err } emailObj.Surl = hosturl // bodystr, err := HttpGetV2(hosturl) if err != nil { return *emailObj, inhost, errors.New("get request error") } //是否是gbk编码 pos := strings.Index(bodystr, "charset=gb") pos2 := strings.Index(bodystr, "bg2312") if pos != -1 || pos2 != -1 { decodeBytes, err := simplifiedchinese.GB18030.NewDecoder().Bytes([]byte(bodystr)) if err != nil { return *emailObj, inhost, errors.New("simplifiedchinese coding change error") } bodystr = string(decodeBytes) } //获取邮箱地地址 emailObj.Emails = append(emailObj.Emails, matchEmail(bodystr)...) //获取联系手机 emailObj.Phones = append(emailObj.Phones, matchPhone(bodystr)...) //获取内页链接列表 matchUrls := matchUrls(bodystr) for _, item := range matchUrls { itemparse, err := url.Parse(item) if err != nil { continue } if strings.Index(itemparse.Path, ".js") != -1 || strings.Index(itemparse.Path, ".css") != -1 { continue } if itemparse.Host == uparse.Host { inhost = append(inhost, item) } if itemparse.Scheme != "http" && itemparse.Scheme != "https" { if strings.Index(itemparse.Path, "/") == 0 { inhost = append(inhost, uparse.Scheme+"://"+uparse.Host+itemparse.Path) } else { inhost = append(inhost, uparse.Scheme+"://"+uparse.Host+"/"+itemparse.Path) } continue } } //获取内页email inhost = RemoveRepeatedElement(inhost) emailObj.Emails = RemoveRepeatedElement(emailObj.Emails) return *emailObj, inhost, nil } func matchEmail(str string) (email []string) { var emailList []string //re, _ := regexp.Compile("\\`) str = re.ReplaceAllString(str, "") //re, _ = regexp.Compile("\\`) str = re.ReplaceAllString(str, "") //替换html标签 re, _ = regexp.Compile(`<[^>]*?>`) str = re.ReplaceAllString(str, "") //只匹配com com.cn cn org org.cn net reg := regexp.MustCompile(`\w+[@|#]{1}\w+\.(com|cn|org|net|org\.cn|com\.cn)`) match := 
reg.FindAllStringSubmatch(str, -1) for _, matched := range match { emailList = append(emailList, strings.Replace(strings.ToLower(matched[0]), "#", "@", -1)) } return emailList[:] } func matchUrls(str string) (urls []string) { var urlList []string reg := regexp.MustCompile("]*?href=[\"|']+([^\"]*?)[\"|'][^>]*?>[^<]*?") match := reg.FindAllStringSubmatch(str, -1) for _, matched := range match { urlList = append(urlList, matched[1]) } return urlList[:] }
核心代码使用 Golang 实现,已经过近 6 个月的实际使用和持续改进,请放心使用。
其中涉及爬虫相关内容,由于当前大部分网站都有反爬虫协议(如 robots.txt),请大家在使用的时候多加注意。