Data Parsing
Data parsing: parsing or extracting data, i.e. taking the specified local data out of the full page that a general-purpose crawler fetched.
Regex parsing
Regex refresher
Single characters:
    .  : any character except a newline
    [] : a set, e.g. [aoe] or [a-w]; matches any one character in the set
    \d : digit, same as [0-9]
    \D : non-digit
    \w : digit, letter, underscore, or CJK character
    \W : the opposite of \w
    \s : any whitespace character (space, tab, form feed, etc.); equivalent to [ \f\n\r\t\v]
    \S : non-whitespace
Quantifiers:
    *     : any number of times (>= 0)
    +     : at least once (>= 1)
    ?     : optional, 0 or 1 time
    {m}   : exactly m times, e.g. hello{3}
    {m,}  : at least m times
    {m,n} : m to n times
Anchors:
    ^ : match at the start
    $ : match at the end
Groups: (ab)
Greedy mode: .*
Non-greedy (lazy) mode: .*?
Flags:
    re.I : ignore case
    re.M : multi-line matching
    re.S : dot-all, lets . match newlines as well
Substitution: re.sub(pattern, replacement, string)
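As a warm-up, a minimal self-contained check of the two points scrapers trip over most, greedy vs. non-greedy matching and re.S (the snippet below is invented for illustration):

import re

html = "<div>first</div><div>second</div>"

# greedy: .* grabs as much as possible, so the match spans both divs
print(re.findall('<div>(.*)</div>', html))   # ['first</div><div>second']

# non-greedy: .*? stops at the earliest closing tag
print(re.findall('<div>(.*?)</div>', html))  # ['first', 'second']

# scraped HTML is full of newlines, and . does not cross them unless re.S is set
html2 = "<div>\nfirst\n</div>"
print(re.findall('<div>(.*?)</div>', html2))        # []
print(re.findall('<div>(.*?)</div>', html2, re.S))  # ['\nfirst\n']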
# scraping post titles from duanziwang.com
import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
url = 'http://duanziwang.com/category/搞笑图/'
# the response data is captured as a string
page_text = requests.get(url=url, headers=headers).text

# data parsing
# a representative pattern: the exact expression depends on the page's current
# HTML; this one assumes each title is the text of an <a> inside the post header
ex = '<h1 class="post-title"><a href=".*?">(.*?)</a></h1>'
ret = re.findall(ex, page_text, re.S)  # when parsing pages with regex, re.S is a must

# persistent storage
with open("./title.txt", "a", encoding="utf-8") as f:
    for i in ret:
        f.write(f"{i}\n")
bs4 parsing
Common methods and attributes (tied together in the sketch after this list)
Tag positioning: locate by tag name; only the first occurrence is returned
soup.tagName : returns the first tag with that name in the current source
Attribute positioning: locate a tag by a given attribute
soup.find(tagName, attrName=value) : only the class attribute takes a trailing underscore, class_
soup.find("tagName")
soup.find_all(tagName, attrName=value) : returns every match as a list
Selector positioning
soup.select(".className")
soup.select("#id")
Hierarchy selectors: a space matches any descendant level, > matches one direct-child level, e.g. soup.select("div a") or soup.select("#box > a")
Getting text
tag.string : only the tag's own direct text (None when the tag has more than one child)
tag.text : all text of the tag and its descendants
Getting attributes
tag["attrName"]
Example: scraping a novel
# a proxy pool is advisable for sites like this
# Example 1
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}
url = "https://www.52bqg.com/book_10508/"
fp = open("./九仙图.txt", "w", encoding="utf-8")
page_text = requests.get(url=url, headers=headers)
page_text.encoding = "GBK"
x = page_text.text
soup = BeautifulSoup(x, 'lxml')
a_list = soup.select('#list a')  # chapter links from the table of contents
for i in a_list:
    title = i.string
    a_href = 'https://www.52bqg.com/book_10508/' + i['href']
    page_text_a = requests.get(url=a_href, headers=headers)
    page_text_a.encoding = "GBK"
    f = page_text_a.text
    a_soup = BeautifulSoup(f, 'lxml')
    div_tag = a_soup.find('div', id='content')
    content = div_tag.text
    fp.write("\n" + title + "\n" + content + "\n")
    print(title, "downloaded")
fp.close()
# Example 2
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}
url = "http://www.balingtxt.com/txtml_84980.html"
page_text = requests.get(url=url, headers=headers)
page_text.encoding = "utf-8"
page_text = page_text.text
menu_soup = BeautifulSoup(page_text, "lxml")
a_lst = menu_soup.select("#yulan > li > a")
fp = open("./天命相师.txt", "w", encoding="utf-8")  # fp is used below, so it must be opened here
for i in a_lst:
    title = i.string
    a_url = i["href"]
    new_text = requests.get(url=a_url, headers=headers)
    new_text.encoding = "utf-8"
    new_text = new_text.text
    content_soup = BeautifulSoup(new_text, "lxml")
    content = content_soup.find("div", class_="book_content").text
    fp.write(f"{title}\n{content}\n")
    print(f"{title} downloaded!")
fp.close()
Scraping image data
With requests
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}
url = "https://s.ynicp.com/Uploads/autouploads/20200319/2_20200319_191026_23gvkxsksoipj.jpg"
img_data = requests.get(url=url, headers=headers).content  # returns binary data
with open("./tiger1.jpg", "wb") as f:
    f.write(img_data)
.content returns binary data; use it when scraping images, audio, or video.
With urllib
from urllib import request

url = "https://s.ynicp.com/Uploads/autouploads/20200319/2_20200319_191026_23gvkxsksoipj.jpg"
ret = request.urlretrieve(url=url, filename="./tiger2.jpg")
print(ret)  # a (filename, headers) tuple
The difference: whether UA spoofing is possible. requests accepts a headers argument; urlretrieve does not.
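That said, the standard library can still send a custom UA by installing a global opener, which urlretrieve then uses under the hood; a minimal sketch (the output filename here is made up):

from urllib import request

# install a global opener whose headers every subsequent urllib request will carry
opener = request.build_opener()
opener.addheaders = [('User-Agent',
                      'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36')]
request.install_opener(opener)

url = "https://s.ynicp.com/Uploads/autouploads/20200319/2_20200319_191026_23gvkxsksoipj.jpg"
request.urlretrieve(url=url, filename="./tiger3.jpg")  # hypothetical filename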
XPath parsing
Parsing principle & workflow
1. Instantiate an etree object and load the page source into it: tree = etree.HTML(page_text)
2. Locate tags and extract data with an XPath expression: tree.xpath('expression')
Tag positioning
A leading /: locates the tag starting from the root node (almost never used)
A / anywhere else: one level of hierarchy
A leading //: locates the specified tag from anywhere in the document
A // anywhere else: any number of levels (the most commonly used)
Attribute positioning: "//tagName[@attrName='value']"
//div[starts-with(@class,'ta')]  # all divs whose class value starts with 'ta'
Getting text: /text() returns the tag's direct text, //text() returns all text beneath it
Getting attributes: /@attrName, e.g. //a/@href (see the sketch after this list)
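A small self-contained sketch of these expressions (the HTML snippet below is invented for illustration):

from lxml import etree

html = '''
<div class="slist">
    <ul>
        <li><a href="/pic/1.html">first</a></li>
        <li><a href="/pic/2.html">second</a></li>
    </ul>
</div>
'''
tree = etree.HTML(html)                               # instantiate an etree object
print(tree.xpath('//div[@class="slist"]//a/text()'))  # ['first', 'second']
print(tree.xpath('//li/a/@href'))                     # ['/pic/1.html', '/pic/2.html']
print(tree.xpath('//li[1]/a/text()'))                 # ['first']; XPath indexing starts at 1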
Example: downloading images with XPath
import requests
import os
from lxml import etree
from urllib import request

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
dirName = "imglibs"  # folder for the downloaded images
if not os.path.exists(dirName):
    os.mkdir(dirName)  # create the folder if it does not exist
page_url = "http://pic.netbian.com/4kfengjing/index_%d.html"
for page_num in range(1, 10):  # site-wide crawl; pick the page range that suits you
    if page_num == 1:
        url = "http://pic.netbian.com/4kfengjing/"
    else:
        url = format(page_url % page_num)
    page_text = requests.get(url=url, headers=headers).text
    # data parsing
    tree = etree.HTML(page_text)  # instantiate an etree object
    img_lst = tree.xpath('//div[@class="slist"]/ul/li/a')  # returns a list of <a> elements
    for i in img_lst:
        img_href = "http://pic.netbian.com" + i.xpath("./@href")[0]  # join paths to get the detail-page URL
        img_text = requests.get(url=img_href, headers=headers).text
        new_tree = etree.HTML(img_text)  # instantiate a fresh etree object
        img_list = new_tree.xpath('//a[@id="img"]/img')[0]  # the <img> element of the full-size picture
        img_src = "http://pic.netbian.com" + img_list.xpath('./@src')[0]  # join paths to get the full-resolution URL
        img_alt = img_list.xpath('./@alt')[0].encode('iso-8859-1').decode('GBK')  # the picture's name, re-decoded from GBK
        filepath = "./" + dirName + "/" + img_alt + ".jpg"  # add the .jpg suffix
        request.urlretrieve(img_src, filename=filepath)  # persistent storage
        print(img_alt, "downloaded!")
print("Over!")
pyquery parsing
Usage
from pyquery import PyQuery as pq

doc = pq(html)                          # parse an HTML string
doc = pq(url="http://news.baidu.com/")  # parse a live web page
doc = pq(filename="./a.html")           # parse a local HTML file
from pyquery import PyQuery as pq

html = '''
<div id="wrap">
    <ul class="s_from">
        asdasd
        <link href="http://asda.com">asdadasdad12312</link>
        <link href="http://asda1.com">asdadasdad12312</link>
        <link href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
print(doc("#wrap .s_from link"))
Output:
<link href="http://asda.com">asdadasdad12312</link>
<link href="http://asda1.com">asdadasdad12312</link>
<link href="http://asda2.com">asdadasdad12312</link>
from pyquery import PyQuery as pq

html = '''
<div id="wrap">
    <ul class="s_from">
        asdasd
        <link href="http://asda.com">asdadasdad12312</link>
        <link href="http://asda1.com">asdadasdad12312</link>
        <link href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
# find child elements
doc = pq(html)
items = doc("#wrap")
print(items)
print("type: %s" % type(items))
link = items.find('.s_from')
print(link)
link = items.children()
print(link)
Output:
<div id="wrap">
    <ul class="s_from">
        asdasd
        <link href="http://asda.com">asdadasdad12312</link>
        <link href="http://asda1.com">asdadasdad12312</link>
        <link href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
type: <class 'pyquery.pyquery.PyQuery'>
<ul class="s_from">
    asdasd
    <link href="http://asda.com">asdadasdad12312</link>
    <link href="http://asda1.com">asdadasdad12312</link>
    <link href="http://asda2.com">asdadasdad12312</link>
</ul>
<ul class="s_from">
    asdasd
    <link href="http://asda.com">asdadasdad12312</link>
    <link href="http://asda1.com">asdadasdad12312</link>
    <link href="http://asda2.com">asdadasdad12312</link>
</ul>
from pyquery import PyQuery as pq

html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link href="http://asda.com">asdadasdad12312</link>
        <link href="http://asda1.com">asdadasdad12312</link>
        <link href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
items = doc(".s_from")
print(items)
# find the parent element
parent_href = items.parent()
print(parent_href)
Output:
<ul class="s_from">
    asdasd
    <link href="http://asda.com">asdadasdad12312</link>
    <link href="http://asda1.com">asdadasdad12312</link>
    <link href="http://asda2.com">asdadasdad12312</link>
</ul>
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link href="http://asda.com">asdadasdad12312</link>
        <link href="http://asda1.com">asdadasdad12312</link>
        <link href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
parent returns the enclosing tag together with everything it contains; similarly, parents retrieves all ancestor nodes.
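As a quick check, continuing from the parent example above (same html and items variables):

# parents() walks every ancestor, not just the immediate one
all_ancestors = items.parents()
print(type(all_ancestors))  # <class 'pyquery.pyquery.PyQuery'>
print(all_ancestors)        # every ancestor of the <ul>, here the enclosing <div>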
from pyquery import PyQuery as pq

html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
        <link class="active2" href="http://asda1.com">asdadasdad12312</link>
        <link class="movie1" href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
items = doc("link.active1.a123")
print(items)
# find sibling elements
siblings_href = items.siblings()
print(siblings_href)
Output:
<link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
<link class="active2" href="http://asda1.com">asdadasdad12312</link>
<link class="movie1" href="http://asda2.com">asdadasdad12312</link>
As the output shows, siblings returns the other tags at the same level.
Conclusion: child, parent, and sibling lookups all return PyQuery objects, so the result can be selected on again.
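For instance, continuing from the sibling example above (same doc variable), selections chain step by step:

# each step returns a PyQuery object, so selections can be chained
chained = doc("div").find(".s_from").children("link.active2")
print(chained)               # <link class="active2" href="http://asda1.com">asdadasdad12312</link>
print(chained.attr("href"))  # http://asda1.com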
from pyquery import PyQuery as pq

html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
        <link class="active2" href="http://asda1.com">asdadasdad12312</link>
        <link class="movie1" href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
its = doc("link").items()
for it in its:
    print(it)
Output:
<link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
<link class="active2" href="http://asda1.com">asdadasdad12312</link>
<link class="movie1" href="http://asda2.com">asdadasdad12312</link>
from pyquery import PyQuery as pq

html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
        <link class="active2" href="http://asda1.com">asdadasdad12312</link>
        <link class="movie1" href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
its = doc("link").items()
for it in its:
    print(it.attr('href'))
    print(it.attr.href)
Output:
http://asda.com
http://asda.com
http://asda1.com
http://asda1.com
http://asda2.com
http://asda2.com
from pyquery import PyQuery as pq

html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
        <link class="active2" href="http://asda1.com">asdadasdad12312</link>
        <link class="movie1" href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
its = doc("link").items()
for it in its:
    print(it.text())
Output:
asdadasdad12312
asdadasdad12312
asdadasdad12312
from pyquery import PyQuery as pq

html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
        <link class="active2" href="http://asda1.com">asdadasdad12312</link>
        <link class="movie1" href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
its = doc("link").items()
for it in its:
    print(it.html())
Output:
asdadasdad12312
asdadasdad12312
asdadasdad12312
Common DOM operations
Adding and removing classes
addClass
removeClass
from pyquery import PyQuery as pq

html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
        <link class="active2" href="http://asda1.com">asdadasdad12312</link>
        <link class="movie1" href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
its = doc("link").items()
for it in its:
    print("add: %s" % it.addClass('active1'))
    print("remove: %s" % it.removeClass('active1'))
Output:
add: <link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
remove: <link class="a123" href="http://asda.com">asdadasdad12312</link>
add: <link class="active2 active1" href="http://asda1.com">asdadasdad12312</link>
remove: <link class="active2" href="http://asda1.com">asdadasdad12312</link>
add: <link class="movie1 active1" href="http://asda2.com">asdadasdad12312</link>
remove: <link class="movie1" href="http://asda2.com">asdadasdad12312</link>
Note that a class which is already present will not be added a second time.
from pyquery import PyQuery as pq

html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
        <link class="active2" href="http://asda1.com">asdadasdad12312</link>
        <link class="movie1" href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
its = doc("link").items()
for it in its:
    print("modify: %s" % it.attr('class', 'active'))
    print("add: %s" % it.css('font-size', '14px'))
Output:
modify: <link class="active" href="http://asda.com">asdadasdad12312</link>
add: <link class="active" href="http://asda.com" style="font-size: 14px">asdadasdad12312</link>
modify: <link class="active" href="http://asda1.com">asdadasdad12312</link>
add: <link class="active" href="http://asda1.com" style="font-size: 14px">asdadasdad12312</link>
modify: <link class="active" href="http://asda2.com">asdadasdad12312</link>
add: <link class="active" href="http://asda2.com" style="font-size: 14px">asdadasdad12312</link>
attr and css operate on the object directly; the modification happens in place.
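A quick way to see that, continuing from the snippet above (same doc variable): changes made through the per-item objects are visible from the original document as well:

# after the loop, every link's class has been rewritten to 'active'
print(doc("link.active"))         # all three links now match the new class
print(doc("link").attr("class"))  # 'active'; attr() reads the first matched element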
remove
remove: deletes the matched tag together with its subtree
from pyquery import PyQuery as pq

html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class="active1 a123" href="http://asda.com">asdadasdad12312</link>
        <link class="active2" href="http://asda1.com">asdadasdad12312</link>
        <link class="movie1" href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
its = doc("div")
print('text before removal:\n%s' % its.text())
it = its.remove('ul')
print('text after removal:\n%s' % it.text())
Output:
text before removal:
hello nihao asdasd asdadasdad12312 asdadasdad12312 asdadasdad12312
text after removal:
hello nihao
Other DOM methods are documented at:
http://pyquery.readthedocs.io/en/latest/api.html
Pseudo-class selectors
from pyquery import PyQuery as pq

html = '''
<div href="wrap">
    hello nihao
    <ul class="s_from">
        asdasd
        <link class="active1 a123" href="http://asda.com">helloasdadasdad12312</link>
        <link class="active2" href="http://asda1.com">asdadasdad12312</link>
        <link class="movie1" href="http://asda2.com">asdadasdad12312</link>
    </ul>
</div>
'''
doc = pq(html)
its = doc("link:first-child")
print('first tag: %s' % its)
its = doc("link:last-child")
print('last tag: %s' % its)
its = doc("link:nth-child(2)")
print('second tag: %s' % its)
its = doc("link:gt(0)")  # index starts from zero
print('tags after index 0: %s' % its)
its = doc("link:nth-child(2n-1)")
print('odd-position tags: %s' % its)
its = doc("link:contains('hello')")
print("tags whose text contains 'hello': %s" % its)
Output:
first tag: <link class="active1 a123" href="http://asda.com">helloasdadasdad12312</link>
last tag: <link class="movie1" href="http://asda2.com">asdadasdad12312</link>
second tag: <link class="active2" href="http://asda1.com">asdadasdad12312</link>
tags after index 0: <link class="active2" href="http://asda1.com">asdadasdad12312</link><link class="movie1" href="http://asda2.com">asdadasdad12312</link>
odd-position tags: <link class="active1 a123" href="http://asda.com">helloasdadasdad12312</link><link class="movie1" href="http://asda2.com">asdadasdad12312</link>
tags whose text contains 'hello': <link class="active1 a123" href="http://asda.com">helloasdadasdad12312</link>