# 使用urllib来获取百度首页的源码 import urllib.request # (1)定义一个url 就是你要访问的地址 url = 'http://www.baidu.com' # (2)模拟浏览器向服务器发送请求 response响应 response = urllib.request.urlopen(url) # (3)获取响应中的页面的源码 content 内容的意思 # read方法 返回的是字节形式的二进制数据 # 我们要将二进制的数据转换为字符串 # 二进制--》字符串 解码 decode('编码的格式') content = response.read().decode('utf-8') # (4)打印数据 print(content)
import urllib.request url = 'http://www.baidu.com' # 模拟浏览器向服务器发送请求 response = urllib.request.urlopen(url) # 一个类型和六个方法 # response是HTTPResponse的类型 # print(type(response)) # 按照一个字节一个字节的去读 # content = response.read() # print(content) # 返回多少个字节 # content = response.read(5) # print(content) # 读取一行 # content = response.readline() # print(content) # content = response.readlines() # print(content) # 返回状态码 如果是200了 那么就证明我们的逻辑没有错 # print(response.getcode()) # 返回的是url地址 # print(response.geturl()) # 获取是一个状态信息 print(response.getheaders()) # 一个类型 HTTPResponse # 六个方法 read readline readlines getcode geturl getheaders
import urllib.request # 下载网页 # url_page = 'http://www.baidu.com' # url代表的是下载的路径 filename文件的名字 # 在python中 可以变量的名字 也可以直接写值 # urllib.request.urlretrieve(url_page,'baidu.html') # 下载图片 # url_img = 'https://img1.baidu.com/it/u=3004965690,4089234593&fm=26&fmt=auto&gp=0.jpg' # # urllib.request.urlretrieve(url= url_img,filename='lisa.jpg') # 下载视频 url_video = 'https://vd3.bdstatic.com/mda-mhkku4ndaka5etk3/1080p/cae_h264/1629557146541497769/mda-mhkku4ndaka5etk3.mp4?v_from_s=hkapp-haokan-tucheng&auth_key=1629687514-0-0-7ed57ed7d1168bb1f06d18a4ea214300&bcevod_channel=searchbox_feed&pd=1&pt=3&abtest=' urllib.request.urlretrieve(url_video,'hxekyyds.mp4')
import urllib.request url = 'https://www.baidu.com' # url的组成 # https://www.baidu.com/s?wd=周杰伦 # http/https www.baidu.com 80/443 s wd = 周杰伦 # # 协议 主机 端口号 路径 参数 锚点 # http 80 # https 443 # mysql 3306 # oracle 1521 # redis 6379 # mongodb 27017 headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } # 因为urlopen方法中不能存储字典 所以headers不能传递进去 # 请求对象的定制 request = urllib.request.Request(url=url,headers=headers) response = urllib.request.urlopen(request) content = response.read().decode('utf8') print(content)
# https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6 # 需求 获取 https://www.baidu.com/s?wd=周杰伦的网页源码 import urllib.request import urllib.parse url = 'https://www.baidu.com/s?wd=' # 请求对象的定制为了解决反爬的第一种手段 headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } # 将周杰伦三个字变成unicode编码的格式 # 我们需要依赖于urllib.parse name = urllib.parse.quote('周杰伦') url = url + name # 请求对象的定制 request = urllib.request.Request(url=url,headers=headers) # 模拟浏览器向服务器发送请求 response = urllib.request.urlopen(request) # 获取响应的内容 content = response.read().decode('utf-8') # 打印数据 print(content)
# urlencode应用场景:多个参数的时候 # https://www.baidu.com/s?wd=周杰伦&sex=男 # import urllib.parse # # data = { # 'wd':'周杰伦', # 'sex':'男', # 'location':'中国台湾省' # } # # a = urllib.parse.urlencode(data) # print(a) #获取https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7的网页源码 import urllib.request import urllib.parse base_url = 'https://www.baidu.com/s?' data = { 'wd':'周杰伦', 'sex':'男', 'location':'中国台湾省' } new_data = urllib.parse.urlencode(data) # 请求资源路径 url = base_url + new_data headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } # 请求对象的定制 request = urllib.request.Request(url=url,headers=headers) # 模拟浏览器向服务器发送请求 response = urllib.request.urlopen(request) # 获取网页源码的数据 content = response.read().decode('utf-8') # 打印数据 print(content)
# post请求 import urllib.request import urllib.parse url = 'https://fanyi.baidu.com/sug' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } data = { 'kw':'spider' } # post请求的参数 必须要进行编码 data = urllib.parse.urlencode(data).encode('utf-8') # post的请求的参数 是不会拼接在url的后面的 而是需要放在请求对象定制的参数中 # post请求的参数 必须要进行编码 request = urllib.request.Request(url=url,data=data,headers=headers) # 模拟浏览器向服务器发送请求 response = urllib.request.urlopen(request) # 获取响应的数据 content = response.read().decode('utf-8') # 字符串--》json对象 import json obj = json.loads(content) print(obj) # post请求方式的参数 必须编码 data = urllib.parse.urlencode(data) # 编码之后 必须调用encode方法 data = urllib.parse.urlencode(data).encode('utf-8') # 参数是放在请求对象定制的方法中 request = urllib.request.Request(url=url,data=data,headers=headers)
import urllib.request import urllib.parse url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh' headers = { # 'Accept': '*/*', # 'Accept-Encoding': 'gzip, deflate, br', # 'Accept-Language': 'zh-CN,zh;q=0.9', # 'Connection': 'keep-alive', # 'Content-Length': '135', # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Cookie': 'BIDUPSID=DAA8F9F0BD801A2929D96D69CF7EBF50; PSTM=1597202227; BAIDUID=DAA8F9F0BD801A29B2813502000BF8E9:SL=0:NR=10:FG=1; __yjs_duid=1_c19765bd685fa6fa12c2853fc392f8db1618999058029; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; BDUSS=R2bEZvTjFCNHQxdUV-cTZ-MzZrSGxhbUYwSkRkUWk2SkxxS3E2M2lqaFRLUlJoRVFBQUFBJCQAAAAAAAAAAAEAAAA3e~BTveK-9sHLZGF5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFOc7GBTnOxgaW; BDUSS_BFESS=R2bEZvTjFCNHQxdUV-cTZ-MzZrSGxhbUYwSkRkUWk2SkxxS3E2M2lqaFRLUlJoRVFBQUFBJCQAAAAAAAAAAAEAAAA3e~BTveK-9sHLZGF5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFOc7GBTnOxgaW; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=DAA8F9F0BD801A29B2813502000BF8E9:SL=0:NR=10:FG=1; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=2; H_PS_PSSID=34435_31660_34405_34004_34073_34092_26350_34426_34323_22158_34390; delPer=1; BA_HECTOR=8185a12020018421b61gi6ka20q; BCLID=10943521300863382545; BDSFRCVID=boDOJexroG0YyvRHKn7hh7zlD_weG7bTDYLEOwXPsp3LGJLVJeC6EG0Pts1-dEu-EHtdogKK0mOTHv8F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tR3aQ5rtKRTffjrnhPF3-44vXP6-hnjy3bRkX4Q4Wpv_Mnndjn6SQh4Wbttf5q3RymJ42-39LPO2hpRjyxv4y4Ldj4oxJpOJ-bCL0p5aHl51fbbvbURvD-ug3-7qqU5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIE3-oJqC8hMIt43f; BCLID_BFESS=10943521300863382545; BDSFRCVID_BFESS=boDOJexroG0YyvRHKn7hh7zlD_weG7bTDYLEOwXPsp3LGJLVJeC6EG0Pts1-dEu-EHtdogKK0mOTHv8F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tR3aQ5rtKRTffjrnhPF3-44vXP6-hnjy3bRkX4Q4Wpv_Mnndjn6SQh4Wbttf5q3RymJ42-39LPO2hpRjyxv4y4Ldj4oxJpOJ-bCL0p5aHl51fbbvbURvD-ug3-7qqU5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIE3-oJqC8hMIt43f; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1629701482,1629702031,1629702343,1629704515; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1629704515; __yjs_st=2_MDBkZDdkNzg4YzYyZGU2NTM5NzBjZmQ0OTZiMWRmZGUxM2QwYzkwZTc2NTZmMmIxNDJkYzk4NzU1ZDUzN2U3Yjc4ZTJmYjE1YTUzMTljYWFkMWUwYmVmZGEzNmZjN2FlY2M3NDAzOThhZTY5NzI0MjVkMmQ0NWU3MWE1YTJmNGE5NDBhYjVlOWY3MTFiMWNjYTVhYWI0YThlMDVjODBkNWU2NjMwMzY2MjFhZDNkMzVhNGMzMGZkMWY2NjU5YzkxMDk3NTEzODJiZWUyMjEyYTk5YzY4ODUyYzNjZTJjMGM5MzhhMWE5YjU3NTM3NWZiOWQxNmU3MDVkODExYzFjN183XzliY2RhYjgz; ab_sr=1.0.1_ZTc2ZDFkMTU5ZTM0ZTM4MWVlNDU2MGEzYTM4MzZiY2I2MDIxNzY1Nzc1OWZjZGNiZWRhYjU5ZjYwZmNjMTE2ZjIzNmQxMTdiMzIzYTgzZjVjMTY0ZjM1YjMwZTdjMjhiNDRmN2QzMjMwNWRhZmUxYTJjZjZhNTViMGM2ODFlYjE5YTlmMWRjZDAwZGFmMDY4ZTFlNGJiZjU5YzE1MGIxN2FiYTU3NDgzZmI4MDdhMDM5NTQ0MjQxNDBiNzdhMDdl', # 'Host': 'fanyi.baidu.com', # 'Origin': 'https://fanyi.baidu.com', # 'Referer': 'https://fanyi.baidu.com/?aldtype=16047', # 'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"', # 'sec-ch-ua-mobile': '?0', # 'Sec-Fetch-Dest': 'empty', # 'Sec-Fetch-Mode': 'cors', # 'Sec-Fetch-Site': 'same-origin', # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36', # 'X-Requested-With': 'XMLHttpRequest', } data = { 'from': 'en', 'to': 'zh', 'query': 'love', 'transtype': 'realtime', 'simple_means_flag': '3', 'sign': '198772.518981', 'token': '......', 'domain': 'common', } # post请求的参数 必须进行编码 并且要调用encode方法 data = urllib.parse.urlencode(data).encode('utf-8') # 请求对象的定制 request = urllib.request.Request(url = url,data = data,headers = headers) # 模拟浏览器向服务器发送请求 response = urllib.request.urlopen(request) # 获取响应的数据 content = response.read().decode('utf-8') import json obj = json.loads(content) print(obj)
# get请求 # 获取豆瓣电影的第一页的数据 并且保存起来 import urllib.request url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } # (1) 请求对象的定制 request = urllib.request.Request(url=url,headers=headers) # (2)获取响应的数据 response = urllib.request.urlopen(request) content = response.read().decode('utf-8') # (3) 数据下载到本地 # open方法默认情况下使用的是gbk的编码 如果我们要想保存汉字 那么需要在open方法中指定编码格式为utf-8 # encoding = 'utf-8' # fp = open('douban.json','w',encoding='utf-8') # fp.write(content) with open('douban1.json','w',encoding='utf-8') as fp: fp.write(content)
# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=& # start=0&limit=20 # https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=& # start=20&limit=20 # https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=& # start=40&limit=20 # https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=& # start=60&limit=20 # page 1 2 3 4 # start 0 20 40 60 # start (page - 1)*20 # 下载豆瓣电影前10页的数据 # (1) 请求对象的定制 # (2) 获取响应的数据 # (3) 下载数据 import urllib.parse import urllib.request def create_request(page): base_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&' data = { 'start':(page - 1) * 20, 'limit':20 } data = urllib.parse.urlencode(data) url = base_url + data headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } request = urllib.request.Request(url=url,headers=headers) return request def get_content(request): response = urllib.request.urlopen(request) content = response.read().decode('utf-8') return content def down_load(page,content): with open('douban_' + str(page) + '.json','w',encoding='utf-8')as fp: fp.write(content) # 程序的入口 if __name__ == '__main__': start_page = int(input('请输入起始的页码')) end_page = int(input('请输入结束的页面')) for page in range(start_page,end_page+1): # 每一页都有自己的请求对象的定制 request = create_request(page) # 获取响应的数据 content = get_content(request) # 下载 down_load(page,content)
# 1页 # http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname # post # cname: 北京 # pid: # pageIndex: 1 # pageSize: 10 # 2页 # http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname # post # cname: 北京 # pid: # pageIndex: 2 # pageSize: 10 import urllib.request import urllib.parse def create_request(page): base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname' data = { 'cname': '北京', 'pid':'', 'pageIndex': page, 'pageSize': '10' } data = urllib.parse.urlencode(data).encode('utf-8') headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } request = urllib.request.Request(url=base_url,headers=headers,data=data) return request def get_content(request): response = urllib.request.urlopen(request) content = response.read().decode('utf-8') return content def down_load(page,content): with open('kfc_' + str(page) + '.json','w',encoding='utf-8')as fp: fp.write(content) if __name__ == '__main__': start_page = int(input('请输入起始页码')) end_page = int(input('请输入结束页码')) for page in range(start_page,end_page+1): # 请求对象的定制 request = create_request(page) # 获取网页源码 content = get_content(request) # 下载 down_load(page,content)
import urllib.request import urllib.error # url = 'https://blog.csdn.net/sulixu/article/details/1198189491' url = 'http://www.doudan1111.com' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } try: request = urllib.request.Request(url = url, headers = headers) response = urllib.request.urlopen(request) content = response.read().decode('utf-8') print(content) except urllib.error.HTTPError: print('系统正在升级。。。') except urllib.error.URLError: print('我都说了 系统正在升级。。。')
# 适用的场景:数据采集的时候 需要绕过登陆 然后进入到某个页面 # 个人信息页面是utf-8 但是还报错了编码错误 因为并没有进入到个人信息页面 而是跳转到了登陆页面 # 那么登陆页面不是utf-8 所以报错 # 什么情况下访问不成功? # 因为请求头的信息不够 所以访问不成功 import urllib.request url = 'https://weibo.cn/6451491586/info' headers = { # ':authority': 'weibo.cn', # ':method': 'GET', # ':path': '/6451491586/info', # ':scheme': 'https', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', # 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'cache-control': 'max-age=0', # cookie中携带着你的登陆信息 如果有登陆之后的cookie 那么我们就可以携带着cookie进入到任何页面 'cookie': '_T_WM=24c44910ba98d188fced94ba0da5960e; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFxxfgNNUmXi4YiaYZKr_J_5NHD95QcSh-pSh.pSKncWs4DqcjiqgSXIgvVPcpD; SUB=_2A25MKKG_DeRhGeBK7lMV-S_JwzqIHXVv0s_3rDV6PUJbktCOLXL2kW1NR6e0UHkCGcyvxTYyKB2OV9aloJJ7mUNz; SSOLoginState=1630327279', # referer 判断当前路径是不是由上一个路径进来的 一般情况下 是做图片防盗链 'referer': 'https://weibo.cn/', 'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"', 'sec-ch-ua-mobile': '?0', 'sec-fetch-dest': 'document', 'sec-fetch-mode': 'navigate', 'sec-fetch-site': 'same-origin', 'sec-fetch-user': '?1', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36', } # 请求对象的定制 request = urllib.request.Request(url=url,headers=headers) # 模拟浏览器向服务器发送请求 response = urllib.request.urlopen(request) # 获取响应的数据 content = response.read().decode('utf-8') # 将数据保存到本地 with open('weibo.html','w',encoding='utf-8')as fp: fp.write(content)
# 需求 使用handler来访问百度 获取网页源码 import urllib.request url = 'http://www.baidu.com' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } request = urllib.request.Request(url = url,headers = headers) # handler build_opener open # (1)获取hanlder对象 handler = urllib.request.HTTPHandler() # (2)获取opener对象 opener = urllib.request.build_opener(handler) # (3) 调用open方法 response = opener.open(request) content = response.read().decode('utf-8') print(content)
import urllib.request url = 'http://www.baidu.com/s?wd=ip' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } # 请求对象的定制 request = urllib.request.Request(url = url,headers= headers) # 模拟浏览器访问服务器 # response = urllib.request.urlopen(request) proxies = { 'http':'118.24.219.151:16817' } # handler build_opener open handler = urllib.request.ProxyHandler(proxies = proxies) opener = urllib.request.build_opener(handler) response = opener.open(request) # 获取响应的信息 content = response.read().decode('utf-8') # 保存 with open('daili.html','w',encoding='utf-8')as fp: fp.write(content)
import urllib.request proxies_pool = [ {'http':'118.24.219.151:16817'}, {'http':'118.24.219.151:16817'}, ] import random proxies = random.choice(proxies_pool) url = 'http://www.baidu.com/s?wd=ip' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } request = urllib.request.Request(url = url,headers=headers) handler = urllib.request.ProxyHandler(proxies=proxies) opener = urllib.request.build_opener(handler) response = opener.open(request) content = response.read().decode('utf-8') with open('daili.html','w',encoding='utf-8')as fp: fp.write(content)
Title
from lxml import etree # xpath解析 # (1)本地文件 etree.parse # (2)服务器响应的数据 response.read().decode('utf-8') ***** etree.HTML() # xpath解析本地文件 tree = etree.parse('爬虫_解析_xpath的基本使用.html') #tree.xpath('xpath路径') # 查找ul下面的li # li_list = tree.xpath('//body/ul/li') # 查找所有有id的属性的li标签 # text()获取标签中的内容 # li_list = tree.xpath('//ul/li[@id]/text()') # 找到id为l1的li标签 注意引号的问题 # li_list = tree.xpath('//ul/li[@id="l1"]/text()') # 查找到id为l1的li标签的class的属性值 # li = tree.xpath('//ul/li[@id="l1"]/@class') # 查询id中包含l的li标签 # li_list = tree.xpath('//ul/li[contains(@id,"l")]/text()') # 查询id的值以l开头的li标签 # li_list = tree.xpath('//ul/li[starts-with(@id,"c")]/text()') #查询id为l1和class为c1的 # li_list = tree.xpath('//ul/li[@id="l1" and @]/text()') li_list = tree.xpath('//ul/li[@id="l1"]/text() | //ul/li[@id="l2"]/text()') # 判断列表的长度 print(li_list) print(len(li_list))
# (1) 获取网页的源码 # (2) 解析 解析的服务器响应的文件 etree.HTML # (3) 打印 import urllib.request url = 'https://www.baidu.com/' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } # 请求对象的定制 request = urllib.request.Request(url = url,headers = headers) # 模拟浏览器访问服务器 response = urllib.request.urlopen(request) # 获取网页源码 content = response.read().decode('utf-8') # 解析网页源码 来获取我们想要的数据 from lxml import etree # 解析服务器响应的文件 tree = etree.HTML(content) # 获取想要的数据 xpath的返回值是一个列表类型的数据 result = tree.xpath('//input[@id="su"]/@value')[0] print(result)
# (1) 请求对象的定制 # (2)获取网页的源码 # (3)下载 # 需求 下载的前十页的图片 # https://sc.chinaz.com/tupian/qinglvtupian.html 1 # https://sc.chinaz.com/tupian/qinglvtupian_page.html import urllib.request from lxml import etree def create_request(page): if(page == 1): url = 'https://sc.chinaz.com/tupian/qinglvtupian.html' else: url = 'https://sc.chinaz.com/tupian/qinglvtupian_' + str(page) + '.html' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36', } request = urllib.request.Request(url = url, headers = headers) return request def get_content(request): response = urllib.request.urlopen(request) content = response.read().decode('utf-8') return content def down_load(content): # 下载图片 # urllib.request.urlretrieve('图片地址','文件的名字') tree = etree.HTML(content) name_list = tree.xpath('//div[@id="container"]//a/img/@alt') # 一般设计图片的网站都会进行懒加载 src_list = tree.xpath('//div[@id="container"]//a/img/@src2') for i in range(len(name_list)): name = name_list[i] src = src_list[i] url = 'https:' + src urllib.request.urlretrieve(url=url,filename='./loveImg/' + name + '.jpg') if __name__ == '__main__': start_page = int(input('请输入起始页码')) end_page = int(input('请输入结束页码')) for page in range(start_page,end_page+1): # (1) 请求对象的定制 request = create_request(page) # (2)获取网页的源码 content = get_content(request) # (3)下载 down_load(content)
{ "store": { "book": [ { "category": "修真", "author": "六道", "title": "坏蛋是怎样练成的", "price": 8.95 }, { "category": "修真", "author": "天蚕土豆", "title": "斗破苍穹", "price": 12.99 }, { "category": "修真", "author": "唐家三少", "title": "斗罗大陆", "isbn": "0-553-21311-3", "price": 8.99 }, { "category": "修真", "author": "南派三叔", "title": "星辰变", "isbn": "0-395-19395-8", "price": 22.99 } ], "bicycle": { "author": "老马", "color": "黑色", "price": 19.95 } } }
import json import jsonpath obj = json.load(open('爬虫_解析_jsonpath.json','r',encoding='utf-8')) # 书店所有书的作者 # author_list = jsonpath.jsonpath(obj,'$.store.book[*].author') # print(author_list) # 所有的作者 # author_list = jsonpath.jsonpath(obj,'$..author') # print(author_list) # store下面的所有的元素 # tag_list = jsonpath.jsonpath(obj,'$.store.*') # print(tag_list) # store里面所有东西的price # price_list = jsonpath.jsonpath(obj,'$.store..price') # print(price_list) # 第三个书 # book = jsonpath.jsonpath(obj,'$..book[2]') # print(book) # 最后一本书 # book = jsonpath.jsonpath(obj,'$..book[(@.length-1)]') # print(book) # 前面的两本书 # book_list = jsonpath.jsonpath(obj,'$..book[0,1]') # book_list = jsonpath.jsonpath(obj,'$..book[:2]') # print(book_list) # 条件过滤需要在()的前面添加一个? # 过滤出所有的包含isbn的书。 # book_list = jsonpath.jsonpath(obj,'$..book[?(@.isbn)]') # print(book_list) # 哪本书超过了10块钱 book_list = jsonpath.jsonpath(obj,'$..book[?(@.price>10)]') print(book_list)
{"returnCode":"0","returnValue":{"A":[{"id":3643,"parentId":0,"regionName":"阿坝","cityCode":513200,"pinYin":"ABA"},{"id":3090,"parentId":0,"regionName":"阿克苏","cityCode":652900,"pinYin":"AKESU"},{"id":3632,"parentId":0,"regionName":"阿拉善","cityCode":152900,"pinYin":"ALASHAN"},{"id":899,"parentId":0,"regionName":"安康","cityCode":610900,"pinYin":"ANKANG"},{"id":196,"parentId":0,"regionName":"安庆","cityCode":340800,"pinYin":"ANQING"},{"id":758,"parentId":0,"regionName":"鞍山","cityCode":210300,"pinYin":"ANSHAN"},{"id":388,"parentId":0,"regionName":"安顺","cityCode":520400,"pinYin":"ANSHUN"},{"id":454,"parentId":0,"regionName":"安阳","cityCode":410500,"pinYin":"ANYANG"}],"B":[{"id":3633,"parentId":0,"regionName":"白城","cityCode":220800,"pinYin":"BAICHENG"},{"id":356,"parentId":0,"regionName":"百色","cityCode":451000,"pinYin":"BAISE"},{"id":634,"parentId":0,"regionName":"白山","cityCode":220600,"pinYin":"BAISHAN"},{"id":275,"parentId":0,"regionName":"白银","cityCode":620400,"pinYin":"BAIYIN"},{"id":426,"parentId":0,"regionName":"保定","cityCode":130600,"pinYin":"BAODING"},{"id":188,"parentId":0,"regionName":"宝鸡","cityCode":610300,"pinYin":"BAOJI"},{"id":994,"parentId":0,"regionName":"保山","cityCode":530500,"pinYin":"BAOSHAN"},{"id":1181,"parentId":0,"regionName":"包头","cityCode":150200,"pinYin":"BAOTOU"},{"id":789,"parentId":0,"regionName":"巴彦淖尔","cityCode":150800,"pinYin":"BAYANNAOER"},{"id":925,"parentId":0,"regionName":"巴中","cityCode":511900,"pinYin":"BAZHONG"},{"id":358,"parentId":0,"regionName":"北海","cityCode":450500,"pinYin":"BEIHAI"},{"id":3,"parentId":0,"regionName":"北京","cityCode":110100,"pinYin":"BEIJING"},{"id":200,"parentId":0,"regionName":"蚌埠","cityCode":340300,"pinYin":"BENGBU"},{"id":760,"parentId":0,"regionName":"本溪","cityCode":210500,"pinYin":"BENXI"},{"id":390,"parentId":0,"regionName":"毕节","cityCode":522401,"pinYin":"BIJIE"},{"id":824,"parentId":0,"regionName":"滨州","cityCode":371600,"pinYin":"BINZHOU"},{"id":1126,"parentId":0,"regionName":"亳州","cityCode":341600,"pinYin":"BOZHOU"},{"id":5860,"parentId":0,"regionName":"巴音郭楞","cityCode":652800,"pinYin":"BYGL"}],"C":[{"id":430,"parentId":0,"regionName":"沧州","cityCode":130900,"pinYin":"CANGZHOU"},{"id":623,"parentId":0,"regionName":"长春","cityCode":220100,"pinYin":"CHANGCHUN"},{"id":573,"parentId":0,"regionName":"常德","cityCode":430700,"pinYin":"CHANGDE"},{"id":983,"parentId":0,"regionName":"昌吉","cityCode":652300,"pinYin":"CHANGJI"},{"id":5781,"parentId":0,"regionName":"昌江","cityCode":469026,"pinYin":"CHANGJIANG"},{"id":576,"parentId":0,"regionName":"长沙","cityCode":430100,"pinYin":"CHANGSHA"},{"id":883,"parentId":0,"regionName":"长治","cityCode":140400,"pinYin":"CHANGZHI"},{"id":651,"parentId":0,"regionName":"常州","cityCode":320400,"pinYin":"CHANGZHOU"},{"id":3244,"parentId":0,"regionName":"朝阳","cityCode":211300,"pinYin":"CHAOYANG"},{"id":1138,"parentId":0,"regionName":"潮州","cityCode":445100,"pinYin":"CHAOZHOU"},{"id":433,"parentId":0,"regionName":"承德","cityCode":130800,"pinYin":"CHENGDE"},{"id":70,"parentId":0,"regionName":"成都","cityCode":510100,"pinYin":"CHENGDU"},{"id":5859,"parentId":0,"regionName":"澄迈县","cityCode":469023,"pinYin":"CHENGMAI"},{"id":585,"parentId":0,"regionName":"郴州","cityCode":431000,"pinYin":"CHENZHOU"},{"id":791,"parentId":0,"regionName":"赤峰","cityCode":150400,"pinYin":"CHIFENG"},{"id":205,"parentId":0,"regionName":"池州","cityCode":341700,"pinYin":"CHIZHOU"},{"id":40,"parentId":0,"regionName":"重庆","cityCode":500100,"pinYin":"CHONGQING"},{"id":3640,"parentId":0,"regionName":"崇左","cityCode":451400,"pinYin":"CHONGZUO"},{"id":996,"parentId":0,"regionName":"楚雄","cityCode":532300,"pinYin":"CHUXIONG"},{"id":207,"parentId":0,"regionName":"滁州","cityCode":341100,"pinYin":"CHUZHOU"}],"D":[{"id":998,"parentId":0,"regionName":"大理","cityCode":532900,"pinYin":"DALI"},{"id":763,"parentId":0,"regionName":"大连","cityCode":210200,"pinYin":"DALIAN"},{"id":3071,"parentId":0,"regionName":"儋州","cityCode":460400,"pinYin":"DAN"},{"id":753,"parentId":0,"regionName":"丹东","cityCode":210600,"pinYin":"DANDONG"},{"id":514,"parentId":0,"regionName":"大庆","cityCode":230600,"pinYin":"DAQING"},{"id":885,"parentId":0,"regionName":"大同","cityCode":140200,"pinYin":"DATONG"},{"id":3638,"parentId":0,"regionName":"大兴安岭","cityCode":232700,"pinYin":"DAXINGANLING"},{"id":935,"parentId":0,"regionName":"达州","cityCode":511700,"pinYin":"DAZHOU"},{"id":3650,"parentId":0,"regionName":"德宏","cityCode":533100,"pinYin":"DEHONG"},{"id":937,"parentId":0,"regionName":"德阳","cityCode":510600,"pinYin":"DEYANG"},{"id":827,"parentId":0,"regionName":"德州","cityCode":371400,"pinYin":"DEZHOU"},{"id":5884,"parentId":0,"regionName":"定安","cityCode":469021,"pinYin":"DINGANXIAN"},{"id":1135,"parentId":0,"regionName":"定西","cityCode":621100,"pinYin":"DINGXI"},{"id":1000,"parentId":0,"regionName":"迪庆","cityCode":533400,"pinYin":"DIQINGZANGZU"},{"id":5742,"parentId":0,"regionName":"东方","cityCode":469007,"pinYin":"DONGFANG"},{"id":109,"parentId":0,"regionName":"东莞","cityCode":441900,"pinYin":"DONGGUAN"},{"id":829,"parentId":0,"regionName":"东营","cityCode":370500,"pinYin":"DONGYING"}],"E":[{"id":793,"parentId":0,"regionName":"鄂尔多斯","cityCode":150600,"pinYin":"EERDUOSI"},{"id":541,"parentId":0,"regionName":"恩施","cityCode":422800,"pinYin":"ENSHI"},{"id":543,"parentId":0,"regionName":"鄂州","cityCode":420700,"pinYin":"EZHOU"}],"F":[{"id":360,"parentId":0,"regionName":"防城港","cityCode":450600,"pinYin":"FANGCHENGGANG"},{"id":61,"parentId":0,"regionName":"佛山","cityCode":440600,"pinYin":"FOSHAN"},{"id":770,"parentId":0,"regionName":"抚顺","cityCode":210400,"pinYin":"FUSHUN"},{"id":1176,"parentId":0,"regionName":"阜新","cityCode":210900,"pinYin":"FUXIN"},{"id":1125,"parentId":0,"regionName":"阜阳","cityCode":341200,"pinYin":"FUYANG"},{"id":745,"parentId":0,"regionName":"抚州","cityCode":361000,"pinYin":"FUZHOU"},{"id":98,"parentId":0,"regionName":"福州","cityCode":350100,"pinYin":"FUZHOU"}],"G":[{"id":3658,"parentId":0,"regionName":"甘南","cityCode":623000,"pinYin":"GANNAN"},{"id":718,"parentId":0,"regionName":"赣州","cityCode":360700,"pinYin":"GANZHOU"},{"id":3644,"parentId":0,"regionName":"甘孜","cityCode":513300,"pinYin":"GANZI"},{"id":2166,"parentId":43,"regionName":"巩义市","cityCode":410181,"pinYin":"GONGYI","selected":1},{"id":3642,"parentId":0,"regionName":"广安","cityCode":511600,"pinYin":"GUANGAN"},{"id":3453,"parentId":0,"regionName":"广元","cityCode":510800,"pinYin":"GUANGYUAN"},{"id":8,"parentId":0,"regionName":"广州","cityCode":440100,"pinYin":"GUANGZHOU"},{"id":362,"parentId":0,"regionName":"贵港","cityCode":450800,"pinYin":"GUIGANG"},{"id":364,"parentId":0,"regionName":"桂林","cityCode":450300,"pinYin":"GUILIN"},{"id":394,"parentId":0,"regionName":"贵阳","cityCode":520100,"pinYin":"GUIYANG"},{"id":1183,"parentId":0,"regionName":"固原","cityCode":640400,"pinYin":"GUYUAN"}],"H":[{"id":508,"parentId":0,"regionName":"哈尔滨","cityCode":230100,"pinYin":"HAERBIN"},{"id":3659,"parentId":0,"regionName":"海东","cityCode":630200,"pinYin":"HAIDONG"},{"id":414,"parentId":0,"regionName":"海口","cityCode":460100,"pinYin":"HAIKOU"},{"id":5788,"parentId":0,"regionName":"海南州","cityCode":632500,"pinYin":"HAINANZHOU"},{"id":3665,"parentId":0,"regionName":"海西","cityCode":632800,"pinYin":"HAIXI"},{"id":3669,"parentId":0,"regionName":"哈密","cityCode":652200,"pinYin":"HAMI"},{"id":435,"parentId":0,"regionName":"邯郸","cityCode":130400,"pinYin":"HANDAN"},{"id":16,"parentId":0,"regionName":"杭州","cityCode":330100,"pinYin":"HANGZHOU","selected":0},{"id":902,"parentId":0,"regionName":"汉中","cityCode":610700,"pinYin":"HANZHONG"},{"id":460,"parentId":0,"regionName":"鹤壁","cityCode":410600,"pinYin":"HEBI"},{"id":1144,"parentId":0,"regionName":"河池","cityCode":451200,"pinYin":"HECHI"},{"id":210,"parentId":0,"regionName":"合肥","cityCode":340100,"pinYin":"HEFEI"},{"id":1154,"parentId":0,"regionName":"鹤岗","cityCode":230400,"pinYin":"HEGANG"},{"id":3637,"parentId":0,"regionName":"黑河","cityCode":231100,"pinYin":"HEIHE"},{"id":1148,"parentId":0,"regionName":"衡水","cityCode":131100,"pinYin":"HENGSHUI"},{"id":587,"parentId":0,"regionName":"衡阳","cityCode":430400,"pinYin":"HENGYANG"},{"id":3673,"parentId":0,"regionName":"和田","cityCode":653200,"pinYin":"HETIAN"},{"id":319,"parentId":0,"regionName":"河源","cityCode":441600,"pinYin":"HEYUAN"},{"id":832,"parentId":0,"regionName":"菏泽","cityCode":371700,"pinYin":"HEZE"},{"id":370,"parentId":0,"regionName":"贺州","cityCode":451100,"pinYin":"HEZHOU"},{"id":1002,"parentId":0,"regionName":"红河","cityCode":532500,"pinYin":"HONGHE"},{"id":666,"parentId":0,"regionName":"淮安","cityCode":320800,"pinYin":"HUAIAN"},{"id":1127,"parentId":0,"regionName":"淮北","cityCode":340600,"pinYin":"HUAIBEI"},{"id":590,"parentId":0,"regionName":"怀化","cityCode":431200,"pinYin":"HUAIHUA"},{"id":215,"parentId":0,"regionName":"淮南","cityCode":340400,"pinYin":"HUAINAN"},{"id":547,"parentId":0,"regionName":"黄冈","cityCode":421100,"pinYin":"HUANGGANG"},{"id":3661,"parentId":0,"regionName":"黄南","cityCode":632300,"pinYin":"HUANGNAN"},{"id":217,"parentId":0,"regionName":"黄山","cityCode":341000,"pinYin":"HUANGSHAN"},{"id":550,"parentId":0,"regionName":"黄石","cityCode":420200,"pinYin":"HUANGSHI"},{"id":796,"parentId":0,"regionName":"呼和浩特","cityCode":150100,"pinYin":"HUHEHAOTE"},{"id":163,"parentId":0,"regionName":"惠州","cityCode":441300,"pinYin":"HUIZHOU"},{"id":776,"parentId":0,"regionName":"葫芦岛","cityCode":211400,"pinYin":"HULUDAO"},{"id":801,"parentId":0,"regionName":"呼伦贝尔","cityCode":150700,"pinYin":"HULUNBEIER"},{"id":173,"parentId":0,"regionName":"湖州","cityCode":330500,"pinYin":"HUZHOU"}],"J":[{"id":523,"parentId":0,"regionName":"佳木斯","cityCode":230800,"pinYin":"JIAMUSI"},{"id":747,"parentId":0,"regionName":"吉安","cityCode":360800,"pinYin":"JIAN"},{"id":317,"parentId":0,"regionName":"江门","cityCode":440700,"pinYin":"JIANGMEN"},{"id":462,"parentId":0,"regionName":"焦作","cityCode":410800,"pinYin":"JIAOZUO"},{"id":156,"parentId":0,"regionName":"嘉兴","cityCode":330400,"pinYin":"JIAXING"},{"id":1136,"parentId":0,"regionName":"嘉峪关","cityCode":620200,"pinYin":"JIAYUGUAN"},{"id":327,"parentId":0,"regionName":"揭阳","cityCode":445200,"pinYin":"JIEYANG"},{"id":628,"parentId":0,"regionName":"吉林","cityCode":220200,"pinYin":"JILIN"},{"id":837,"parentId":0,"regionName":"济南","cityCode":370100,"pinYin":"JINAN"},{"id":3556,"parentId":0,"regionName":"金昌","cityCode":620300,"pinYin":"JINCHANG"},{"id":892,"parentId":0,"regionName":"晋城","cityCode":140500,"pinYin":"JINCHENG"},{"id":724,"parentId":0,"regionName":"景德镇","cityCode":360200,"pinYin":"JINGDEZHEN"},{"id":536,"parentId":0,"regionName":"荆门","cityCode":420800,"pinYin":"JINGMEN"},{"id":545,"parentId":0,"regionName":"荆州","cityCode":421000,"pinYin":"JINGZHOU"},{"id":142,"parentId":0,"regionName":"金华","cityCode":330700,"pinYin":"JINHUA"},{"id":842,"parentId":0,"regionName":"济宁","cityCode":370800,"pinYin":"JINING"},{"id":894,"parentId":0,"regionName":"晋中","cityCode":140700,"pinYin":"JINZHONG"},{"id":779,"parentId":0,"regionName":"锦州","cityCode":210700,"pinYin":"JINZHOU"},{"id":726,"parentId":0,"regionName":"九江","cityCode":360400,"pinYin":"JIUJIANG"},{"id":277,"parentId":0,"regionName":"酒泉","cityCode":620900,"pinYin":"JIUQUAN"},{"id":521,"parentId":0,"regionName":"鸡西","cityCode":230300,"pinYin":"JIXI"},{"id":1102,"parentId":0,"regionName":"济源","cityCode":410881,"pinYin":"JIYUAN"}],"K":[{"id":466,"parentId":0,"regionName":"开封","cityCode":410200,"pinYin":"KAIFENG"},{"id":985,"parentId":0,"regionName":"喀什","cityCode":653100,"pinYin":"KASHEN"},{"id":3667,"parentId":0,"regionName":"克拉玛依","cityCode":650200,"pinYin":"KELAMAYI"},{"id":3672,"parentId":0,"regionName":"克孜勒苏柯尔克孜","cityCode":653000,"pinYin":"KEZILESUKEERKEZI"},{"id":18,"parentId":0,"regionName":"昆明","cityCode":530100,"pinYin":"KUNMING"}],"L":[{"id":3639,"parentId":0,"regionName":"来宾","cityCode":451300,"pinYin":"LAIBIN"},{"id":419,"parentId":0,"regionName":"廊坊","cityCode":131000,"pinYin":"LANGFANG"},{"id":279,"parentId":0,"regionName":"兰州","cityCode":620100,"pinYin":"LANZHOU"},{"id":979,"parentId":0,"regionName":"拉萨","cityCode":540100,"pinYin":"LASA"},{"id":940,"parentId":0,"regionName":"乐山","cityCode":511100,"pinYin":"LESHAN"},{"id":3645,"parentId":0,"regionName":"凉山","cityCode":513400,"pinYin":"LIANGSHAN"},{"id":677,"parentId":0,"regionName":"连云港","cityCode":320700,"pinYin":"LIANYUNGANG"},{"id":847,"parentId":0,"regionName":"聊城","cityCode":371500,"pinYin":"LIAOCHENG"},{"id":1178,"parentId":0,"regionName":"辽阳","cityCode":211000,"pinYin":"LIAOYANG"},{"id":630,"parentId":0,"regionName":"辽源","cityCode":220400,"pinYin":"LIAOYUAN"},{"id":992,"parentId":0,"regionName":"丽江","cityCode":530700,"pinYin":"LIJIANG"},{"id":1008,"parentId":0,"regionName":"临沧","cityCode":530900,"pinYin":"LINCANG"},{"id":890,"parentId":0,"regionName":"临汾","cityCode":141000,"pinYin":"LINFEN"},{"id":5590,"parentId":0,"regionName":"临高","cityCode":469024,"pinYin":"LINGAO"},{"id":3498,"parentId":0,"regionName":"临夏","cityCode":622900,"pinYin":"LINXIA"},{"id":849,"parentId":0,"regionName":"临沂","cityCode":371300,"pinYin":"LINYI"},{"id":3657,"parentId":0,"regionName":"林芝","cityCode":542600,"pinYin":"LINZHI"},{"id":1039,"parentId":0,"regionName":"丽水","cityCode":331100,"pinYin":"LISHUI"},{"id":227,"parentId":0,"regionName":"六安","cityCode":341500,"pinYin":"LIUAN"},{"id":406,"parentId":0,"regionName":"六盘水","cityCode":520200,"pinYin":"LIUPANSHUI"},{"id":380,"parentId":0,"regionName":"柳州","cityCode":450200,"pinYin":"LIUZHOU"},{"id":288,"parentId":0,"regionName":"陇南","cityCode":621200,"pinYin":"LONGNAN"},{"id":263,"parentId":0,"regionName":"龙岩","cityCode":350800,"pinYin":"LONGYAN"},{"id":595,"parentId":0,"regionName":"娄底","cityCode":431300,"pinYin":"LOUDI"},{"id":5863,"parentId":0,"regionName":"陵水","cityCode":469028,"pinYin":"LS"},{"id":1194,"parentId":0,"regionName":"吕梁","cityCode":141100,"pinYin":"LULIANG"},{"id":495,"parentId":0,"regionName":"漯河","cityCode":411100,"pinYin":"LUOHE"},{"id":486,"parentId":0,"regionName":"洛阳","cityCode":410300,"pinYin":"LUOYANG"},{"id":959,"parentId":0,"regionName":"泸州","cityCode":510500,"pinYin":"LUZHOU"}],"M":[{"id":170,"parentId":0,"regionName":"马鞍山","cityCode":340500,"pinYin":"MAANSHAN"},{"id":348,"parentId":0,"regionName":"茂名","cityCode":440900,"pinYin":"MAOMING"},{"id":961,"parentId":0,"regionName":"眉山","cityCode":511400,"pinYin":"MEISHAN"},{"id":350,"parentId":0,"regionName":"梅州","cityCode":441400,"pinYin":"MEIZHOU"},{"id":944,"parentId":0,"regionName":"绵阳","cityCode":510700,"pinYin":"MIANYANG"},{"id":528,"parentId":0,"regionName":"牡丹江","cityCode":231000,"pinYin":"MUDANJIANG"}],"N":[{"id":738,"parentId":0,"regionName":"南昌","cityCode":360100,"pinYin":"NANCHANG"},{"id":968,"parentId":0,"regionName":"南充","cityCode":511300,"pinYin":"NANCHONG"},{"id":63,"parentId":0,"regionName":"南京","cityCode":320100,"pinYin":"NANJING"},{"id":372,"parentId":0,"regionName":"南宁","cityCode":450100,"pinYin":"NANNING"},{"id":254,"parentId":0,"regionName":"南平","cityCode":350700,"pinYin":"NANPING"},{"id":132,"parentId":0,"regionName":"南通","cityCode":320600,"pinYin":"NANTONG"},{"id":499,"parentId":0,"regionName":"南阳","cityCode":411300,"pinYin":"NANYANG"},{"id":970,"parentId":0,"regionName":"内江","cityCode":511000,"pinYin":"NEIJIANG"},{"id":147,"parentId":0,"regionName":"宁波","cityCode":330200,"pinYin":"NINGBO"},{"id":268,"parentId":0,"regionName":"宁德","cityCode":350900,"pinYin":"NINGDE"},{"id":3651,"parentId":0,"regionName":"怒江","cityCode":533300,"pinYin":"NUJIANG"}],"P":[{"id":784,"parentId":0,"regionName":"盘锦","cityCode":211100,"pinYin":"PANJIN"},{"id":951,"parentId":0,"regionName":"攀枝花","cityCode":510400,"pinYin":"PANZHIHUA"},{"id":502,"parentId":0,"regionName":"平顶山","cityCode":410400,"pinYin":"PINGDINGSHAN"},{"id":1137,"parentId":0,"regionName":"平凉","cityCode":620800,"pinYin":"PINGLIANG"},{"id":711,"parentId":0,"regionName":"萍乡","cityCode":360300,"pinYin":"PINGXIANG"},{"id":3198,"parentId":0,"regionName":"普洱","cityCode":530800,"pinYin":"PUER"},{"id":271,"parentId":0,"regionName":"莆田","cityCode":350300,"pinYin":"PUTIAN"},{"id":458,"parentId":0,"regionName":"濮阳","cityCode":410900,"pinYin":"PUYANG"}],"Q":[{"id":3647,"parentId":0,"regionName":"黔东南","cityCode":522600,"pinYin":"QIANDONGNAN"},{"id":1158,"parentId":0,"regionName":"潜江","cityCode":429005,"pinYin":"QIANJIANG"},{"id":3648,"parentId":0,"regionName":"黔南","cityCode":522700,"pinYin":"QIANNAN"},{"id":3646,"parentId":0,"regionName":"黔西南","cityCode":522300,"pinYin":"QIANXINAN"},{"id":51,"parentId":0,"regionName":"青岛","cityCode":370200,"pinYin":"QINGDAO"},{"id":3318,"parentId":0,"regionName":"庆阳","cityCode":621000,"pinYin":"QINGYANG"},{"id":102,"parentId":0,"regionName":"清远","cityCode":441800,"pinYin":"QINGYUAN"},{"id":446,"parentId":0,"regionName":"秦皇岛","cityCode":130300,"pinYin":"QINHUANGDAO"},{"id":1145,"parentId":0,"regionName":"钦州","cityCode":450700,"pinYin":"QINZHOU"},{"id":1124,"parentId":0,"regionName":"琼海","cityCode":469002,"pinYin":"QIONGHAI"},{"id":5851,"parentId":0,"regionName":"琼中","cityCode":469030,"pinYin":"QIONGZHONG"},{"id":530,"parentId":0,"regionName":"齐齐哈尔","cityCode":230200,"pinYin":"QIQIHAER"},{"id":3636,"parentId":0,"regionName":"七台河","cityCode":230900,"pinYin":"QITAIHE"},{"id":245,"parentId":0,"regionName":"泉州","cityCode":350500,"pinYin":"QUANZHOU"},{"id":1016,"parentId":0,"regionName":"曲靖","cityCode":530300,"pinYin":"QUJING"},{"id":145,"parentId":0,"regionName":"衢州","cityCode":330800,"pinYin":"QUZHOU"}],"R":[{"id":3654,"parentId":0,"regionName":"日喀则","cityCode":540200,"pinYin":"RIKEZE"},{"id":877,"parentId":0,"regionName":"日照","cityCode":371100,"pinYin":"RIZHAO"}],"S":[{"id":449,"parentId":0,"regionName":"三门峡","cityCode":411200,"pinYin":"SANMENXIA"},{"id":239,"parentId":0,"regionName":"三明","cityCode":350400,"pinYin":"SANMING"},{"id":410,"parentId":0,"regionName":"三亚","cityCode":460200,"pinYin":"SANYA"},{"id":1,"parentId":0,"regionName":"上海","cityCode":310100,"pinYin":"SHANGHAI"},{"id":897,"parentId":0,"regionName":"商洛","cityCode":611000,"pinYin":"SHANGLUO"},{"id":452,"parentId":0,"regionName":"商丘","cityCode":411400,"pinYin":"SHANGQIU"},{"id":713,"parentId":0,"regionName":"上饶","cityCode":361100,"pinYin":"SHANGRAO"},{"id":3653,"parentId":0,"regionName":"山南","cityCode":540500,"pinYin":"SHANNANSHI"},{"id":290,"parentId":0,"regionName":"汕头","cityCode":440500,"pinYin":"SHANTOU"},{"id":294,"parentId":0,"regionName":"汕尾","cityCode":441500,"pinYin":"SHANWEI"},{"id":296,"parentId":0,"regionName":"韶关","cityCode":440200,"pinYin":"SHAOGUAN"},{"id":66,"parentId":0,"regionName":"绍兴","cityCode":330600,"pinYin":"SHAOXING"},{"id":571,"parentId":0,"regionName":"邵阳","cityCode":430500,"pinYin":"SHAOYANG"},{"id":75,"parentId":0,"regionName":"沈阳","cityCode":210100,"pinYin":"SHENYANG"},{"id":28,"parentId":0,"regionName":"深圳","cityCode":440300,"pinYin":"SHENZHEN"},{"id":1200,"parentId":0,"regionName":"石河子","cityCode":659001,"pinYin":"SHIHEZI"},{"id":59,"parentId":0,"regionName":"石家庄","cityCode":130100,"pinYin":"SHIJIAZHUANG"},{"id":68,"parentId":0,"regionName":"十堰","cityCode":420300,"pinYin":"SHIYAN"},{"id":807,"parentId":0,"regionName":"石嘴山","cityCode":640200,"pinYin":"SHIZUISHAN"},{"id":3635,"parentId":0,"regionName":"双鸭山","cityCode":230500,"pinYin":"SHUANGYASHAN"},{"id":3629,"parentId":0,"regionName":"朔州","cityCode":140600,"pinYin":"SHUOZHOU"},{"id":621,"parentId":0,"regionName":"四平","cityCode":220300,"pinYin":"SIPING"},{"id":1174,"parentId":0,"regionName":"松原","cityCode":220700,"pinYin":"SONGYUAN"},{"id":511,"parentId":0,"regionName":"绥化","cityCode":231200,"pinYin":"SUIHUA"},{"id":922,"parentId":0,"regionName":"遂宁","cityCode":510900,"pinYin":"SUINING"},{"id":534,"parentId":0,"regionName":"随州","cityCode":421300,"pinYin":"SUIZHOU"},{"id":644,"parentId":0,"regionName":"宿迁","cityCode":321300,"pinYin":"SUQIAN"},{"id":193,"parentId":0,"regionName":"宿州","cityCode":341300,"pinYin":"SUZHOU"},{"id":107,"parentId":0,"regionName":"苏州","cityCode":320500,"pinYin":"SUZHOU"}],"T":[{"id":3674,"parentId":0,"regionName":"塔城","cityCode":654200,"pinYin":"TACHENG"},{"id":817,"parentId":0,"regionName":"泰安","cityCode":370900,"pinYin":"TAIAN"},{"id":81,"parentId":0,"regionName":"太原","cityCode":140100,"pinYin":"TAIYUAN"},{"id":181,"parentId":0,"regionName":"台州","cityCode":331000,"pinYin":"TAIZHOU"},{"id":640,"parentId":0,"regionName":"泰州","cityCode":321200,"pinYin":"TAIZHOU"},{"id":83,"parentId":0,"regionName":"唐山","cityCode":130200,"pinYin":"TANGSHAN"},{"id":22,"parentId":0,"regionName":"天津","cityCode":120100,"pinYin":"TIANJIN"},{"id":1159,"parentId":0,"regionName":"天门","cityCode":429006,"pinYin":"TIANMEN"},{"id":1119,"parentId":0,"regionName":"天水","cityCode":620500,"pinYin":"TIANSHUI"},{"id":1179,"parentId":0,"regionName":"铁岭","cityCode":211200,"pinYin":"TIELING"},{"id":1187,"parentId":0,"regionName":"铜川","cityCode":610200,"pinYin":"TONGCHUAN"},{"id":619,"parentId":0,"regionName":"通化","cityCode":220500,"pinYin":"TONGHUA"},{"id":787,"parentId":0,"regionName":"通辽","cityCode":150500,"pinYin":"TONGLIAO"},{"id":191,"parentId":0,"regionName":"铜陵","cityCode":340700,"pinYin":"TONGLING"},{"id":386,"parentId":0,"regionName":"铜仁","cityCode":522201,"pinYin":"TONGREN"}],"W":[{"id":5534,"parentId":0,"regionName":"万宁","cityCode":469006,"pinYin":"WANNING"},{"id":821,"parentId":0,"regionName":"潍坊","cityCode":370700,"pinYin":"WEIFANG"},{"id":853,"parentId":0,"regionName":"威海","cityCode":371000,"pinYin":"WEIHAI"},{"id":905,"parentId":0,"regionName":"渭南","cityCode":610500,"pinYin":"WEINAN"},{"id":5773,"parentId":0,"regionName":"文昌","cityCode":469005,"pinYin":"WENCHANG"},{"id":3269,"parentId":0,"regionName":"文山","cityCode":532600,"pinYin":"WENSHAN"},{"id":1047,"parentId":0,"regionName":"温州","cityCode":330300,"pinYin":"WENZHOU"},{"id":803,"parentId":0,"regionName":"乌海","cityCode":150300,"pinYin":"WUHAI"},{"id":10,"parentId":0,"regionName":"武汉","cityCode":420100,"pinYin":"WUHAN"},{"id":219,"parentId":0,"regionName":"芜湖","cityCode":340200,"pinYin":"WUHU"},{"id":5754,"parentId":0,"regionName":"五家渠","cityCode":659004,"pinYin":"WUJIAQU"},{"id":3630,"parentId":0,"regionName":"乌兰察布","cityCode":150900,"pinYin":"WULANCHABU"},{"id":987,"parentId":0,"regionName":"乌鲁木齐","cityCode":650100,"pinYin":"WULUMUQI"},{"id":284,"parentId":0,"regionName":"武威","cityCode":620600,"pinYin":"WUWEI"},{"id":151,"parentId":0,"regionName":"无锡","cityCode":320200,"pinYin":"WUXI"},{"id":3666,"parentId":0,"regionName":"吴忠","cityCode":640300,"pinYin":"WUZHONG"},{"id":374,"parentId":0,"regionName":"梧州","cityCode":450400,"pinYin":"WUZHOU"}],"X":[{"id":89,"parentId":0,"regionName":"厦门","cityCode":350200,"pinYin":"XIAMEN"},{"id":46,"parentId":0,"regionName":"西安","cityCode":610100,"pinYin":"XIAN"},{"id":599,"parentId":0,"regionName":"湘潭","cityCode":430300,"pinYin":"XIANGTAN"},{"id":602,"parentId":0,"regionName":"湘西","cityCode":433100,"pinYin":"XIANGXI"},{"id":731,"parentId":0,"regionName":"襄阳","cityCode":420600,"pinYin":"XIANGYANG"},{"id":538,"parentId":0,"regionName":"咸宁","cityCode":421200,"pinYin":"XIANNING"},{"id":569,"parentId":0,"regionName":"仙桃","cityCode":429004,"pinYin":"XIANTAO"},{"id":918,"parentId":0,"regionName":"咸阳","cityCode":610400,"pinYin":"XIANYANG"},{"id":1160,"parentId":0,"regionName":"孝感","cityCode":420900,"pinYin":"XIAOGAN"},{"id":3303,"parentId":0,"regionName":"锡林郭勒","cityCode":152500,"pinYin":"XILINGUOLE"},{"id":3631,"parentId":0,"regionName":"兴安盟","cityCode":152200,"pinYin":"XINGAN"},{"id":441,"parentId":0,"regionName":"邢台","cityCode":130500,"pinYin":"XINGTAI"},{"id":3679,"parentId":3646,"regionName":"兴义市","cityCode":522301,"pinYin":"XINGYI","selected":1},{"id":814,"parentId":0,"regionName":"西宁","cityCode":630100,"pinYin":"XINING"},{"id":472,"parentId":0,"regionName":"新乡","cityCode":410700,"pinYin":"XINXIANG"},{"id":470,"parentId":0,"regionName":"信阳","cityCode":411500,"pinYin":"XINYANG"},{"id":733,"parentId":0,"regionName":"新余","cityCode":360500,"pinYin":"XINYU"},{"id":3432,"parentId":0,"regionName":"忻州","cityCode":140900,"pinYin":"XINZHOU"},{"id":1010,"parentId":0,"regionName":"西双版纳","cityCode":532800,"pinYin":"XISHUANGBANNA"},{"id":224,"parentId":0,"regionName":"宣城","cityCode":341800,"pinYin":"XUANCHENG"},{"id":477,"parentId":0,"regionName":"许昌","cityCode":411000,"pinYin":"XUCHANG"},{"id":95,"parentId":0,"regionName":"徐州","cityCode":320300,"pinYin":"XUZHOU"}],"Y":[{"id":3438,"parentId":0,"regionName":"雅安","cityCode":511800,"pinYin":"YAAN"},{"id":912,"parentId":0,"regionName":"延安","cityCode":610600,"pinYin":"YANAN"},{"id":3634,"parentId":0,"regionName":"延边","cityCode":222400,"pinYin":"YANBIAN"},{"id":642,"parentId":0,"regionName":"盐城","cityCode":320900,"pinYin":"YANCHENG"},{"id":329,"parentId":0,"regionName":"阳江","cityCode":441700,"pinYin":"YANGJIANG"},{"id":5750,"parentId":0,"regionName":"洋浦","cityCode":469000,"pinYin":"YANGPU"},{"id":1195,"parentId":0,"regionName":"阳泉","cityCode":140300,"pinYin":"YANGQUAN"},{"id":660,"parentId":0,"regionName":"扬州","cityCode":321000,"pinYin":"YANGZHOU"},{"id":105,"parentId":0,"regionName":"烟台","cityCode":370600,"pinYin":"YANTAI"},{"id":949,"parentId":0,"regionName":"宜宾","cityCode":511500,"pinYin":"YIBIN"},{"id":565,"parentId":0,"regionName":"宜昌","cityCode":420500,"pinYin":"YICHANG"},{"id":3463,"parentId":0,"regionName":"伊春","cityCode":230700,"pinYin":"YICHUN"},{"id":716,"parentId":0,"regionName":"宜春","cityCode":360900,"pinYin":"YICHUN"},{"id":1104,"parentId":0,"regionName":"伊犁","cityCode":654000,"pinYin":"YILI"},{"id":810,"parentId":0,"regionName":"银川","cityCode":640100,"pinYin":"YINCHUAN"},{"id":774,"parentId":0,"regionName":"营口","cityCode":210800,"pinYin":"YINGKOU"},{"id":1170,"parentId":0,"regionName":"鹰潭","cityCode":360600,"pinYin":"YINGTAN"},{"id":4636,"parentId":151,"regionName":"宜兴市","cityCode":320282,"pinYin":"YIXINGSHI","selected":1},{"id":605,"parentId":0,"regionName":"益阳","cityCode":430900,"pinYin":"YIYANG"},{"id":1164,"parentId":0,"regionName":"永州","cityCode":431100,"pinYin":"YONGZHOU"},{"id":607,"parentId":0,"regionName":"岳阳","cityCode":430600,"pinYin":"YUEYANG"},{"id":378,"parentId":0,"regionName":"玉林","cityCode":450900,"pinYin":"YULIN"},{"id":914,"parentId":0,"regionName":"榆林","cityCode":610800,"pinYin":"YULIN"},{"id":888,"parentId":0,"regionName":"运城","cityCode":140800,"pinYin":"YUNCHENG"},{"id":332,"parentId":0,"regionName":"云浮","cityCode":445300,"pinYin":"YUNFU"},{"id":3664,"parentId":0,"regionName":"玉树","cityCode":632700,"pinYin":"YUSHU"},{"id":1012,"parentId":0,"regionName":"玉溪","cityCode":530400,"pinYin":"YUXI"}],"Z":[{"id":857,"parentId":0,"regionName":"枣庄","cityCode":370400,"pinYin":"ZAOZHUANG"},{"id":1236,"parentId":0,"regionName":"张家界","cityCode":430800,"pinYin":"ZHANGGUJIE"},{"id":443,"parentId":0,"regionName":"张家口","cityCode":130700,"pinYin":"ZHANGJIAKOU"},{"id":286,"parentId":0,"regionName":"张掖","cityCode":620700,"pinYin":"ZHANGYE"},{"id":243,"parentId":0,"regionName":"漳州","cityCode":350600,"pinYin":"ZHANGZHOU"},{"id":334,"parentId":0,"regionName":"湛江","cityCode":440800,"pinYin":"ZHANJIANG"},{"id":337,"parentId":0,"regionName":"肇庆","cityCode":441200,"pinYin":"ZHAOQING"},{"id":3649,"parentId":0,"regionName":"昭通","cityCode":530600,"pinYin":"ZHAOTONG"},{"id":43,"parentId":0,"regionName":"郑州","cityCode":410100,"pinYin":"ZHENGZHOU"},{"id":657,"parentId":0,"regionName":"镇江","cityCode":321100,"pinYin":"ZHENJIANG"},{"id":339,"parentId":0,"regionName":"中山","cityCode":442000,"pinYin":"ZHONGSHAN"},{"id":1184,"parentId":0,"regionName":"中卫","cityCode":640500,"pinYin":"ZHONGWEI"},{"id":93,"parentId":0,"regionName":"周口","cityCode":411600,"pinYin":"ZHOUKOU"},{"id":1055,"parentId":0,"regionName":"舟山","cityCode":330900,"pinYin":"ZHOUSHAN"},{"id":346,"parentId":0,"regionName":"珠海","cityCode":440400,"pinYin":"ZHUHAI"},{"id":484,"parentId":0,"regionName":"驻马店","cityCode":411700,"pinYin":"ZHUMADIAN"},{"id":597,"parentId":0,"regionName":"株洲","cityCode":430200,"pinYin":"ZHUZHOU"},{"id":860,"parentId":0,"regionName":"淄博","cityCode":370300,"pinYin":"ZIBO"},{"id":955,"parentId":0,"regionName":"自贡","cityCode":510300,"pinYin":"ZIGONG"},{"id":957,"parentId":0,"regionName":"资阳","cityCode":512000,"pinYin":"ZIYANG"},{"id":403,"parentId":0,"regionName":"遵义","cityCode":520300,"pinYin":"ZUNYI"}]}}
import urllib.request url = 'https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1629789477003_137&jsoncallback=jsonp138&action=cityAction&n_s=new&event_submit_doGetAllRegion=true' headers = { # ':authority': 'dianying.taobao.com', # ':method': 'GET', # ':path': '/cityAction.json?activityId&_ksTS=1629789477003_137&jsoncallback=jsonp138&action=cityAction&n_s=new&event_submit_doGetAllRegion=true', # ':scheme': 'https', 'accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01', # 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'cookie': 'cna=UkO6F8VULRwCAXTqq7dbS5A8; miid=949542021157939863; sgcookie=E100F01JK9XMmyoZRigjfmZKExNdRHQqPf4v9NIWIC1nnpnxyNgROLshAf0gz7lGnkKvwCnu1umyfirMSAWtubqc4g%3D%3D; tracknick=action_li; _cc_=UIHiLt3xSw%3D%3D; enc=dA18hg7jG1xapfVGPHoQCAkPQ4as1%2FEUqsG4M6AcAjHFFUM54HWpBv4AAm0MbQgqO%2BiZ5qkUeLIxljrHkOW%2BtQ%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; _m_h5_tk=3ca69de1b9ad7dce614840fcd015dcdb_1629776735568; _m_h5_tk_enc=ab56df54999d1d2cac2f82753ae29f82; t=874e6ce33295bf6b95cfcfaff0af0db6; xlly_s=1; cookie2=13acd8f4dafac4f7bd2177d6710d60fe; v=0; _tb_token_=e65ebbe536158; tfstk=cGhRB7mNpnxkDmUx7YpDAMNM2gTGZbWLxUZN9U4ulewe025didli6j5AFPI8MEC..; l=eBrgmF1cOsMXqSxaBO5aFurza77tzIRb8sPzaNbMiInca6OdtFt_rNCK2Ns9SdtjgtfFBetPVKlOcRCEF3apbgiMW_N-1NKDSxJ6-; isg=BBoas2yXLzHdGp3pCh7XVmpja8A8S54lyLj1RySTHq14l7vRDNufNAjpZ2MLRxa9', 'referer': 'https://dianying.taobao.com/', 'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"', 'sec-ch-ua-mobile': '?0', 'sec-fetch-dest': 'empty', 'sec-fetch-mode': 'cors', 'sec-fetch-site': 'same-origin', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36', 'x-requested-with': 'XMLHttpRequest', } request = urllib.request.Request(url = url, headers = headers) response = urllib.request.urlopen(request) content = response.read().decode('utf-8') # split 切割 content = content.split('(')[1].split(')')[0] with open('爬虫_解析_jsonpath解析淘票票.json','w',encoding='utf-8')as fp: fp.write(content) import json import jsonpath obj = json.load(open('爬虫_解析_jsonpath解析淘票票.json','r',encoding='utf-8')) city_list = jsonpath.jsonpath(obj,'$..regionName') print(city_list)
Title 百度
- 张三
- 李四
- 王五
123 嘿嘿嘿哈哈哈呵呵呵
from bs4 import BeautifulSoup # 通过解析本地文件 来将bs4的基础语法进行讲解 # 默认打开的文件的编码格式是gbk 所以在打开文件的时候需要指定编码 soup = BeautifulSoup(open('爬虫_解析_bs4的基本使用.html',encoding='utf-8'),'lxml') # 根据标签名查找节点 # 找到的是第一个符合条件的数据 # print(soup.a) # 获取标签的属性和属性值 # print(soup.a.attrs) # bs4的一些函数 # (1)find # 返回的是第一个符合条件的数据 # print(soup.find('a')) # 根据title的值来找到对应的标签对象 # print(soup.find('a',title="a2")) # 根据class的值来找到对应的标签对象 注意的是class需要添加下划线 # print(soup.find('a',class_="a1")) # (2)find_all 返回的是一个列表 并且返回了所有的a标签 # print(soup.find_all('a')) # 如果想获取的是多个标签的数据 那么需要在find_all的参数中添加的是列表的数据 # print(soup.find_all(['a','span'])) # limit的作用是查找前几个数据 # print(soup.find_all('li',limit=2)) # (3)select(推荐) # select方法返回的是一个列表 并且会返回多个数据 # print(soup.select('a')) # 可以通过.代表class 我们把这种操作叫做类选择器 # print(soup.select('.a1')) # print(soup.select('#l1')) # 属性选择器---通过属性来寻找对应的标签 # 查找到li标签中有id的标签 # print(soup.select('li[id]')) # 查找到li标签中id为l2的标签 # print(soup.select('li[id="l2"]')) # 层级选择器 # 后代选择器 # 找到的是div下面的li # print(soup.select('div li')) # 子代选择器 # 某标签的第一级子标签 # 注意:很多的计算机编程语言中 如果不加空格不会输出内容 但是在bs4中 不会报错 会显示内容 # print(soup.select('div > ul > li')) # 找到a标签和li标签的所有的对象 # print(soup.select('a,li')) # 节点信息 # 获取节点内容 # obj = soup.select('#d1')[0] # 如果标签对象中 只有内容 那么string和get_text()都可以使用 # 如果标签对象中 除了内容还有标签 那么string就获取不到数据 而get_text()是可以获取数据 # 我们一般情况下 推荐使用get_text() # print(obj.string) # print(obj.get_text()) # 节点的属性 # obj = soup.select('#p1')[0] # name是标签的名字 # print(obj.name) # 将属性值左右一个字典返回 # print(obj.attrs) # 获取节点的属性 obj = soup.select('#p1')[0] print(obj.attrs.get('class')) print(obj.get('class')) print(obj['class'])
import urllib.request url = 'https://www.starbucks.com.cn/menu/' response = urllib.request.urlopen(url) content = response.read().decode('utf-8') from bs4 import BeautifulSoup soup = BeautifulSoup(content,'lxml') # //ul[@]//strong/text() name_list = soup.select('ul[] strong') for name in name_list: print(name.get_text())
import urllib.request url = 'https://www.jd.com/' response = urllib.request.urlopen(url) content = response.read().decode('utf-8') print(content)
# (1)导入selenium from selenium import webdriver # (2) 创建浏览器操作对象 path = 'chromedriver.exe' browser = webdriver.Chrome(path) # (3)访问网站 # url = 'https://www.baidu.com' # # browser.get(url) url = 'https://www.jd.com/' browser.get(url) # page_source获取网页源码 content = browser.page_source print(content)
from selenium import webdriver path = 'chromedriver.exe' browser = webdriver.Chrome(path) url = 'https://www.baidu.com' browser.get(url) # 元素定位 # 根据id来找到对象 # button = browser.find_element_by_id('su') # print(button) # 根据标签属性的属性值来获取对象的 # button = browser.find_element_by_name('wd') # print(button) # 根据xpath语句来获取对象 # button = browser.find_elements_by_xpath('//input[@id="su"]') # print(button) # 根据标签的名字来获取对象 # button = browser.find_elements_by_tag_name('input') # print(button) # 使用的bs4的语法来获取对象 # button = browser.find_elements_by_css_selector('#su') # print(button) # button = browser.find_element_by_link_text('直播') # print(button)
from selenium import webdriver path = 'chromedriver.exe' browser = webdriver.Chrome(path) url = 'http://www.baidu.com' browser.get(url) input = browser.find_element_by_id('su') # 获取标签的属性 print(input.get_attribute('class')) # 获取标签的名字 print(input.tag_name) # 获取元素文本 a = browser.find_element_by_link_text('新闻') print(a.text)
from selenium import webdriver # 创建浏览器对象 path = 'chromedriver.exe' browser = webdriver.Chrome(path) # url url = 'https://www.baidu.com' browser.get(url) import time time.sleep(2) # 获取文本框的对象 input = browser.find_element_by_id('kw') # 在文本框中输入周杰伦 input.send_keys('周杰伦') time.sleep(2) # 获取百度一下的按钮 button = browser.find_element_by_id('su') # 点击按钮 button.click() time.sleep(2) # 滑到底部 js_bottom = 'document.documentElement.scrollTop=100000' browser.execute_script(js_bottom) time.sleep(2) # 获取下一页的按钮 next = browser.find_element_by_xpath('//a[@]') # 点击下一页 next.click() time.sleep(2) # 回到上一页 browser.back() time.sleep(2) # 回去 browser.forward() time.sleep(3) # 退出 browser.quit()
from selenium import webdriver path = 'phantomjs.exe' browser = webdriver.PhantomJS(path) url = 'https://www.baidu.com' browser.get(url) browser.save_screenshot('baidu.png') import time time.sleep(2) input = browser.find_element_by_id('kw') input.send_keys('昆凌') time.sleep(3) browser.save_screenshot('kunling.png')
# from selenium import webdriver # from selenium.webdriver.chrome.options import Options # # chrome_options = Options() # chrome_options.add_argument('--headless') # chrome_options.add_argument('--disable-gpu') # # # path是你自己的chrome浏览器的文件路径 # path = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe' # chrome_options.binary_location = path # # browser = webdriver.Chrome(chrome_options=chrome_options) # # # url = 'https://www.baidu.com' # # browser.get(url) # # browser.save_screenshot('baidu.png') # 封装的handless from selenium import webdriver from selenium.webdriver.chrome.options import Options def share_browser(): chrome_options = Options() chrome_options.add_argument('--headless') chrome_options.add_argument('--disable-gpu') # path是你自己的chrome浏览器的文件路径 path = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe' chrome_options.binary_location = path browser = webdriver.Chrome(chrome_options=chrome_options) return browser browser = share_browser() url = 'https://www.baidu.com' browser.get(url)
import requests url = 'http://www.baidu.com' response = requests.get(url=url) # 一个类型和六个属性 # Response类型 # print(type(response)) # 设置响应的编码格式 # response.encoding = 'utf-8' # 以字符串的形式来返回了网页的源码 # print(response.text) # 返回一个url地址 # print(response.url) # 返回的是二进制的数据 # print(response.content) # 返回响应的状态码 # print(response.status_code) # 返回的是响应头 print(response.headers)
# urllib # (1) 一个类型以及六个方法 # (2)get请求 # (3)post请求 百度翻译 # (4)ajax的get请求 # (5)ajax的post请求 # (6)cookie登陆 微博 # (7)代理 # requests # (1)一个类型以及六个属性 # (2)get请求 # (3)post请求 # (4)代理 # (5)cookie 验证码 import requests url = 'https://www.baidu.com/s' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } data = { 'wd':'北京' } # url 请求资源路径 # params 参数 # kwargs 字典 response = requests.get(url=url,params=data,headers=headers) content = response.text print(content) # 总结: # (1)参数使用params传递 # (2)参数无需urlencode编码 # (3)不需要请求对象的定制 # (4)请求资源路径中的?可以加也可以不加
import requests url = 'https://fanyi.baidu.com/sug' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } data = { 'kw': 'eye' } # url 请求地址 # data 请求参数 # kwargs 字典 response = requests.post(url=url,data=data,headers=headers) content =response.text import json obj = json.loads(content,encoding='utf-8') print(obj) # 总结: # (1)post请求 是不需要编解码 # (2)post请求的参数是data # (3)不需要请求对象的定制
import requests url = 'http://www.baidu.com/s?' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36', } data = { 'wd':'ip' } proxy = { 'http':'212.129.251.55:16816' } response = requests.get(url = url,params=data,headers = headers,proxies = proxy) content = response.text with open('daili.html','w',encoding='utf-8')as fp: fp.write(content)
# 通过登陆 然后进入到主页面 # 通过找登陆接口我们发现 登陆的时候需要的参数很多 # _VIEWSTATE: /m1O5dxmOo7f1qlmvtnyNyhhaUrWNVTs3TMKIsm1lvpIgs0WWWUCQHl5iMrvLlwnsqLUN6Wh1aNpitc4WnOt0So3k6UYdFyqCPI6jWSvC8yBA1Q39I7uuR4NjGo= # __VIEWSTATEGENERATOR: C93BE1AE # from: http://so.gushiwen.cn/user/collect.aspx # email: 595165358@qq.com # pwd: action # code: PId7 # denglu: 登录 # 我们观察到_VIEWSTATE __VIEWSTATEGENERATOR code是一个可以变化的量 # 难点:(1)_VIEWSTATE __VIEWSTATEGENERATOR 一般情况看不到的数据 都是在页面的源码中 # 我们观察到这两个数据在页面的源码中 所以我们需要获取页面的源码 然后进行解析就可以获取了 # (2)验证码 import requests # 这是登陆页面的url地址 url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' } # 获取页面的源码 response = requests.get(url = url,headers = headers) content = response.text # 解析页面源码 然后获取_VIEWSTATE __VIEWSTATEGENERATOR from bs4 import BeautifulSoup soup = BeautifulSoup(content,'lxml') # 获取_VIEWSTATE viewstate = soup.select('#__VIEWSTATE')[0].attrs.get('value') # 获取__VIEWSTATEGENERATOR viewstategenerator = soup.select('#__VIEWSTATEGENERATOR')[0].attrs.get('value') # 获取验证码图片 code = soup.select('#imgCode')[0].attrs.get('src') code_url = 'https://so.gushiwen.cn' + code # 有坑 # import urllib.request # urllib.request.urlretrieve(url=code_url,filename='code.jpg') # requests里面有一个方法 session() 通过session的返回值 就能使用请求变成一个对象 session = requests.session() # 验证码的url的内容 response_code = session.get(code_url) # 注意此时要使用二进制数据 因为我们要使用的是图片的下载 content_code = response_code.content # wb的模式就是将二进制数据写入到文件 with open('code.jpg','wb')as fp: fp.write(content_code) # 获取了验证码的图片之后 下载到本地 然后观察验证码 观察之后 然后在控制台输入这个验证码 就可以将这个值给 # code的参数 就可以登陆 code_name = input('请输入你的验证码') # 点击登陆 url_post = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx' data_post = { '__VIEWSTATE': viewstate, '__VIEWSTATEGENERATOR': viewstategenerator, 'from': 'http://so.gushiwen.cn/user/collect.aspx', 'email': '595165358@qq.com', 'pwd': '.....', 'code': code_name, 'denglu': '登录', } response_post = session.post(url = url, headers = headers, data = data_post) content_post = response_post.text with open('gushiwen.html','w',encoding= ' utf-8')as fp: fp.write(content_post) # 难点 # (1) 隐藏域 # (2) 验证码
#!/usr/bin/env python # coding:utf-8 import requests from hashlib import md5 class Chaojiying_Client(object): def __init__(self, username, password, soft_id): self.username = username password = password.encode('utf8') self.password = md5(password).hexdigest() self.soft_id = soft_id self.base_params = { 'user': self.username, 'pass2': self.password, 'softid': self.soft_id, } self.headers = { 'Connection': 'Keep-Alive', 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)', } def PostPic(self, im, codetype): """ im: 图片字节 codetype: 题目类型 参考 http://www.chaojiying.com/price.html """ params = { 'codetype': codetype, } params.update(self.base_params) files = {'userfile': ('ccc.jpg', im)} r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers) return r.json() def ReportError(self, im_id): """ im_id:报错题目的图片ID """ params = { 'id': im_id, } params.update(self.base_params) r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers) return r.json() if __name__ == '__main__': chaojiying = Chaojiying_Client('超级鹰用户名', '超级鹰用户名的密码', '96001') #用户中心>>软件ID 生成一个替换 96001 im = open('a.jpg', 'rb').read() #本地图片文件路径 来替换 a.jpg 有时WIN系统须要// print(chaojiying.PostPic(im, 1902)) #1902 验证码类型 官方网站>>价格体系 3.4+版 print 后要加()
# (1) pip install scrapy # (2) 报错1: building 'twisted.test.raiser' extension # error: Microsoft Visual C++ 14.0 is required. Get it with "Microsoft Visual C++ # Build Tools": http://landinghub.visualstudio.com/visual-cpp-build-tools # 解决1 # http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted # Twisted‑20.3.0‑cp37‑cp37m‑win_amd64.whl # cp是你的python版本 # amd是你的操作系统的版本 # 下载完成之后 使用pip install twisted的路径 安装 # 切记安装完twisted 再次安装scrapy # (3) 报错2 提示python -m pip install --upgrade pip # 解决2 运行python -m pip install --upgrade pip # (4) 报错3 win32的错误 # 解决3 pip install pypiwin32 # (5) anaconda
# 进入到scrapy shell的终端 直接在window的终端中输入scrapy shell 域名 # 如果想看到一些高亮 或者 自动补全 那么可以安装ipython pip install ipython # scrapy shell www.baidu.com