
[Python Crawler Series] Part 3: Getting Around Crawler Blocking, and Packet Capture Analysis

Posted on 2018-3-15 18:07:35
I. Crawling web pages through a proxy server
Using a proxy server gets around IP-based blocking.
[Python]
'''
Route requests through an HTTP proxy server
'''
import urllib.request


def use_proxy(url, proxy_addr):
    # Build an opener that sends all HTTP traffic through proxy_addr
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    # Install it globally so plain urlopen() calls also go through the proxy
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
    return data


url = 'http://www.baidu.com'
proxy_addr = '120.79.220.136:6666'

print(len(use_proxy(url, proxy_addr)))
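
The proxy address above was a public proxy that was live when this was written and has very likely expired, so swap in a working one of your own. To confirm traffic really goes out through the proxy, a quick check (not in the original post) is to fetch an IP-echo service such as httpbin.org/ip with the same helper:
[Python]
# Sanity check: httpbin.org/ip returns the origin IP of the request.
# If the proxy is in effect, the printed IP should be the proxy's, not yours.
print(use_proxy('http://httpbin.org/ip', proxy_addr))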


Crawling Taobao images
[Python]
'''
Taobao image crawler

Search listing URLs captured while paging (the s parameter steps the result offset):
https://s.taobao.com/list?spm=a21bo.2017.201867-links-0.13.5af911d9FuRsoj&q=t%E6%81%A4&cat=16&style=grid&seller_type=taobao
https://s.taobao.com/list?spm=a21bo.2017.201867-links-0.13.5af911d9FuRsoj&q=t%E6%81%A4&cat=16&style=grid&seller_type=taobao&bcoffset=12&s=60
https://s.taobao.com/list?spm=a21bo.2017.201867-links-0.13.5af911d9FuRsoj&q=t%E6%81%A4&cat=16&style=grid&seller_type=taobao&bcoffset=12&s=120
https://s.taobao.com/list?spm=a21bo.2017.201867-links-0.13.5af911d9FuRsoj&q=t%E6%81%A4&cat=16&style=grid&seller_type=taobao&bcoffset=12&s=300

Image URLs as they appear in the page source:
https://g-search1.alicdn.com/img/bao/uploaded/i4/i2/2974167399/TB2eLV8jL6H8KJjy0FjXXaXepXa_!!2974167399.jpg_230x230.jpg_.webp
"picUrl":"//g-search2.alicdn.com/img/bao/uploaded/i4/i4/2944054187/TB2fcLPicnI8KJjSsziXXb8QpXa_!!2944054187.jpg"

Request header used:
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3346.9 Safari/537.36

A later capture showed Taobao appending anti-bot tokens (smToken/smSign) to the listing URL:
https://s.taobao.com/list?spm=a21bo.2017.201867-links-0.13.5af911d9FuRsoj&q=t%E6%81%A4&cat=16&style=grid&seller_type=taobao&bcoffset=12&s=300&smToken=7171e065119d42be89f788379c793492&smSign=ypDyr3Lcu2k%2BYT%2BKA3G90w%3D%3D
'''

import urllib.request
import re

keyname = '短裙'                      # search keyword ("short skirt")
key = urllib.request.quote(keyname)  # URL-encode it for the query string

headers = ("user-agent",
           "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3346.9 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

for i in range(1, 3):
    # Each search result page advances the s offset by 44 items
    url = 'https://s.taobao.com/search?q=' + key + '&s=' + str(i * 44)
    print(url)
    data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
    # print(data)  # uncomment to inspect the raw page
    # Image URLs are embedded in the page as "picUrl":"//..."
    pat = 'picUrl":"//(.*?)"'
    img_list = re.compile(pat).findall(data)
    print(img_list)
    for j in range(0, len(img_list)):
        this_img = img_list[j]
        this_img_url = 'http://%s' % this_img
        # The img/ directory must already exist
        file = 'img/%s%s.jpg' % (i, j)
        urllib.request.urlretrieve(this_img_url, file)
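
Note that urlretrieve() will not create directories for you; every script in this post saves into a folder such as img/, homework_qiantu/, xiaohua/ or weixin/ that has to exist beforehand. A small addition (not in the original) that creates the folder if it is missing:
[Python]
import os

# Create the target directory if it does not exist yet (no error if it already does)
os.makedirs('img', exist_ok=True)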


#####################
Crawling HD images from one channel of 58pic (Qiantu)
[Python]
'''
Crawl the HD images of one channel on 58pic (Qiantu)
http://www.58pic.com/
'''

import re
import urllib.request

headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3346.9 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
# Each <img> carries the image URL in data-original and its caption in title
pat = '<img class="lazyload" data-original="(.*?)" src=.*? title="(.*?)"'
for i in range(1, 100):
    url = 'http://www.58pic.com/piccate/5-138-0-%s.html' % str(i)
    data = urllib.request.urlopen(url).read().decode('gbk', 'ignore')

    img_info = re.compile(pat).findall(data)
    for j in img_info:
        img_url = j[0]
        img_name = j[1]
        print(img_url, img_name)
        # The homework_qiantu/ directory must already exist
        file = 'homework_qiantu/%s.jpg' % img_name
        urllib.request.urlretrieve(img_url, file)


################
Crawling images from 521609.com (xiaohua)

[Python]
'''
Crawl student photos from 521609.com
http://www.521609.com/daxuexiaohua/list31.html
'''
import re
import urllib.request
import urllib.error


def get_xiaohua_img(page):
    domain = 'http://www.521609.com/'
    headers = ("User-Agent",
               "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3346.9 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    # Each list page links to one detail page per photo
    pat = '<li><a href="(.*?)"><img src=".*?" width="160" height="220" border="0" alt=".*?"'

    for i in range(1, int(page) + 1):
        try:
            url = 'http://www.521609.com/daxuexiaohua/list3%s.html' % str(i)
            data = urllib.request.urlopen(url).read().decode('gb2312', 'ignore')
            html_info = re.compile(pat).findall(data)
            # On the detail page the full-size image sits in an <img> with id/alt attributes
            pat2 = "<img src='(.*?)' id='.*?'  alt='(.*?)' border='.*?' />"
            for j in html_info:
                html_url = '%s%s' % (domain, j)
                img_data = urllib.request.urlopen(html_url).read().decode('gb2312', 'ignore')
                img_info = re.compile(pat2).findall(img_data)
                for k in img_info:
                    img_url = '%s%s' % (domain, k[0])
                    img_name = k[1].split('|')[0] or 'unnamed'
                    # The xiaohua/ directory must already exist
                    file_name = 'xiaohua/%s.jpg' % img_name
                    urllib.request.urlretrieve(img_url, file_name)
                    print('Page %s: image from %s saved' % (i, j))
        except Exception as e:
            print(e)


get_xiaohua_img(40)


###########################
Multithreaded crawl of 521609.com images

[Python]
'''
Crawl student photos from 521609.com
Multithreaded version: one thread handles odd pages, another handles even pages
http://www.521609.com/daxuexiaohua/list31.html
'''
import re
import urllib.request
import urllib.error
import threading


def get_xiaohua_img(page, start_num=0):
    domain = 'http://www.521609.com/'
    headers = ("User-Agent",
               "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3346.9 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    pat = '<li><a href="(.*?)"><img src=".*?" width="160" height="220" border="0" alt=".*?"'

    for i in range(start_num, int(page) + 1, 2):
        try:
            url = 'http://www.521609.com/daxuexiaohua/list3%s.html' % str(i)
            data = urllib.request.urlopen(url).read().decode('gb2312', 'ignore')
            html_info = re.compile(pat).findall(data)
            # print(len(html_info), html_info)
            pat2 = "<img src='(.*?)' id='.*?'  alt='(.*?)' border='.*?' />"
            for j in html_info:
                html_url = '%s%s' % (domain, j)
                img_data = urllib.request.urlopen(html_url).read().decode('gb2312', 'ignore')
                img_info = re.compile(pat2).findall(img_data)
                for k in img_info:
                    img_url = '%s%s' % (domain, k[0])
                    img_name = k[1].split('|')[0] or 'unnamed'
                    file_name = 'xiaohua/%s.jpg' % img_name
                    urllib.request.urlretrieve(img_url, file_name)
                    print('Page %s: image from %s saved' % (i, j))
        except Exception as e:
            print(e)


class Odd(threading.Thread):
    '''
    Crawl the odd-numbered pages
    '''

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        get_xiaohua_img(50, 1)


class Even(threading.Thread):
    '''
    Crawl the even-numbered pages
    '''

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        get_xiaohua_img(50, 0)


odd = Odd()
odd.start()
even = Even()
even.start()

II. Packet capture analysis
Packet capture analysis means intercepting the packets a page sends and receives over the network. When crawling, the data you want is not always in the HTML source; it is often loaded from some other address, so to get at it you capture the traffic, find the hidden URL that actually carries the data, work out its pattern, and then crawl that URL.
A commonly used capture tool is Fiddler.
By default Fiddler only captures HTTP traffic; capturing HTTPS needs extra configuration.
Fiddler HTTPS setup: Tools ==> Options ==> HTTPS ==> tick all the boxes on the left ==> Actions ==> export the root certificate to the desktop ==> open Firefox ==> import that certificate.
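
With Fiddler running, you can also point the crawler itself at Fiddler's local listener (127.0.0.1:8888 by default), so every request the script makes shows up in Fiddler's session list for inspection. A minimal sketch, not from the original post:
[Python]
import urllib.request

# Route both HTTP and HTTPS through Fiddler's default local listener
fiddler = '127.0.0.1:8888'
proxy = urllib.request.ProxyHandler({'http': fiddler, 'https': fiddler})
opener = urllib.request.build_opener(proxy)
urllib.request.install_opener(opener)

# This request now appears as a session inside Fiddler
print(len(urllib.request.urlopen('http://www.baidu.com').read()))

For HTTPS targets this only works after the certificate step above, otherwise Python will reject Fiddler's man-in-the-middle certificate.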

################
Crawling Tencent Video comments
[Python]
'''
Taobao comment feed (captured earlier, kept for comparison):
https://rate.taobao.com/feedRateList.htm?auctionNumId=548434073715&userNumId=2413926519&currentPageNum=1&pageSize=20&rateType=&orderType=sort_weight&attribute=&sku=&hasSku=false&folded=0&ua=098%23E1hvxvvUvbpvUvCkvvvvvjiPPFsWAj3WPF5vAjthPmPZAjrER2d9ljDvRFFpAj182QhvCvvvMMGCvpvVvvBvpvvvmphvLU8oUkUaT2eARdIAcUmxdBAK5kx%2Fgj7xhLIBfv7xYCeOHF%2BSBiVvVbT%2FHmHL5a70%2BFwTW1%2FDDXJ4P33iHP7n%2B2u1nV9VD4oYSd8rJm7ivpvUvvCCURrXKOVEvpvV9pCmpYL9Kphv8vvvphvvvvvvvvCHhQvvvHZvvhZLvvmCvvvvBBWvvvH%2BvvCHUQvvvcptvpvhvvCvp8wCvvpvvhHh&_ksTS=1520323426233_1438&callback=jsonp_tbcrate_reviews_list

Tencent Video page and the comment endpoint captured with Fiddler:
https://v.qq.com/x/cover/h05z5bsjxw544er.html
https://video.coral.qq.com/filmreviewr/c/upcomment/h05z5bsjxw544er?callback=_filmreviewrcupcommenth05z5bsjxw544er&reqnum=3&_=1520325041787

Crawl Tencent Video comments
'''

import re
import urllib.request
import urllib.error
import ssl

context = ssl._create_unverified_context()

headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3346.9 Safari/537.36")

opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
comid = '6354878324451979524'
base = 'https://video.coral.qq.com/filmreviewr/c/upcomment/h05z5bsjxw544er?commentid=%s&callback=_filmreviewrcupcommenth05z5bsjxw544er&reqnum=3&_=1520325041787'
url = base % comid
for i in range(0, 10):
    data = urllib.request.urlopen(url, context=context).read().decode()
    # "last" holds the id of the last comment returned; it is the cursor for the next request
    pat_next = '"last":"(.*?)"'
    next_id = re.compile(pat_next).findall(data)[0]
    pat_content = '"content":"(.*?)",'
    content_data = re.compile(pat_content).findall(data)
    for j in range(0, len(content_data)):
        print('---- Batch %s, comment %s:' % (i, j))
        # The content comes back \uXXXX-escaped; decode it instead of using eval()
        print(content_data[j].encode('utf-8').decode('unicode_escape'))
    url = base % next_id
    print(url)
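
Because the endpoint names a callback in the URL, the body comes back as JSONP, i.e. JSON wrapped in a function call. Instead of regexing fields out of the raw text, you can strip the wrapper and hand the payload to the json module; a sketch under that assumption (the exact key layout still has to be checked against a captured response):
[Python]
import json


def jsonp_to_dict(text):
    # Strip the "callbackName( ... )" wrapper and parse the JSON payload inside
    start = text.index('(') + 1
    end = text.rindex(')')
    return json.loads(text[start:end])


payload = jsonp_to_dict(data)
# Inspect the keys of a captured response to locate the comment list and the "last" cursor
print(list(payload.keys()))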


Multithreaded crawlers: parts of the program run in parallel, and threads set up sensibly can raise crawl throughput.
#########
Single-threaded crawl of Qiushibaike jokes

[Python]
'''
Single-threaded crawler
Crawl jokes from Qiushibaike
https://www.qiushibaike.com/8hr/page/4/
'''

import re
import urllib.request
import urllib.error
import ssl

context = ssl._create_unverified_context()

headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3346.9 Safari/537.36")

opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

for i in range(1, 20):
    url = 'https://www.qiushibaike.com/8hr/page/%s/' % i
    page_data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
    # re.S lets .*? match across line breaks inside the content div
    pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
    data_list = re.compile(pat, re.S).findall(page_data)
    for j in range(0, len(data_list)):
        print('Page %s, joke %s:' % (i, j))
        print(data_list[j].strip())


#########
Multithreaded crawl of Qiushibaike jokes

[Python]
'''
Multithreaded crawler
Crawl jokes from Qiushibaike
https://www.qiushibaike.com/8hr/page/4/
'''

import re
import urllib.request
import urllib.error
import ssl
import threading
import time

context = ssl._create_unverified_context()

headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3346.9 Safari/537.36")

opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)


class One(threading.Thread):
    # Handles the odd-numbered pages (1, 3, 5, ...)
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        for i in range(1, 20, 2):
            try:
                url = 'https://www.qiushibaike.com/8hr/page/%s/' % i
                page_data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
                pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
                data_list = re.compile(pat, re.S).findall(page_data)
                for j in range(0, len(data_list)):
                    print('Page %s, joke %s:' % (i, j))
                    print(data_list[j])
                    time.sleep(0.1)
            except Exception as e:
                print(e)


class Two(threading.Thread):
    # Handles the even-numbered pages (0, 2, 4, ...)
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        for i in range(0, 20, 2):
            try:
                url = 'https://www.qiushibaike.com/8hr/page/%s/' % i
                page_data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
                pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
                data_list = re.compile(pat, re.S).findall(page_data)
                for j in range(0, len(data_list)):
                    print('Page %s, joke %s:' % (i, j))
                    print(data_list[j].strip())
                    time.sleep(0.1)
            except Exception as e:
                print(e)

t1 = One()
t1.start()

t2 = Two()
t2.start()
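
Hard-coding one Thread subclass per slice of pages works, but it does not scale beyond two workers and duplicates the crawl logic. Below is a sketch, not in the original post, of the same crawl spread over a standard-library thread pool, reusing the imports and the opener set up above:
[Python]
'''
Sketch: the same crawl distributed over a thread pool instead of two hand-written Thread subclasses.
'''
from concurrent.futures import ThreadPoolExecutor


def crawl_page(i):
    # Fetch one listing page and print every joke on it
    url = 'https://www.qiushibaike.com/8hr/page/%s/' % i
    page_data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
    pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
    for j, joke in enumerate(re.compile(pat, re.S).findall(page_data)):
        print('Page %s, joke %s:' % (i, j))
        print(joke.strip())


# Four worker threads share pages 1..19; map() blocks until all pages are done
with ThreadPoolExecutor(max_workers=4) as pool:
    pool.map(crawl_page, range(1, 20))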

III. WeChat article crawler
A crawler that automatically collects information about WeChat articles (here, via Sogou WeChat search). WeChat imposes many restrictions on crawlers, so we need techniques to work around them, such as using proxy IPs and disguising the crawler as a regular browser.
[Python]
'''
Crawl WeChat articles via Sogou WeChat search
'''

# http://weixin.sogou.com/
import re
import urllib.request
import time
import urllib.error


# Helper: fetch one URL through the given proxy server
def use_proxy(proxy_addr, url):
    # Exception handling: one failed request should not abort the whole crawl
    try:
        req = urllib.request.Request(url)
        req.add_header('User-Agent',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(req).read()
        return data
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # On a URLError, back off for 10 seconds
        time.sleep(10)
    except Exception as e:
        print("exception:" + str(e))
        # On any other exception, back off for 1 second
        time.sleep(1)


# Search keyword
key = "Python"
# Proxy server; it may have stopped working, so replace it with a valid one of your own
proxy = "127.0.0.1:8888"
# Number of result pages to crawl
for i in range(0, 1):
    key = urllib.request.quote(key)
    thispageurl = "http://weixin.sogou.com/weixin?type=2&query=" + key + "&page=" + str(i)
    thispagedata = use_proxy(proxy, thispageurl)
    print(len(str(thispagedata)))
    # Each search result links to the article via <a target="_blank" href="...">
    pat1 = '<a target="_blank" href="(.*?)"'
    rs1 = re.compile(pat1, re.S).findall(str(thispagedata))
    if len(rs1) == 0:
        print("Page " + str(i) + " returned no article links")
        continue
    for j in range(0, len(rs1)):
        thisurl = rs1[j]
        # Undo the HTML-escaped ampersands (&amp;) in the captured href
        thisurl = thisurl.replace("amp;", "")
        # file = "F:/天善-Python数据分析与挖掘课程/result/32/第" + str(i) + "页第" + str(j) + "篇文章.html"
        file = "weixin/第" + str(i) + "页第" + str(j) + "篇文章.html"
        thisdata = use_proxy(proxy, thisurl)
        try:
            fh = open(file, "wb")
            fh.write(thisdata)
            fh.close()
            print("第" + str(i) + "页第" + str(j) + "篇文章成功")
        except Exception as e:
            print(e)
            print("第" + str(i) + "页第" + str(j) + "篇文章失败")

