文章目录
- python爬虫 - 爬取html格式数据(CSDN博客)
- 1. 第一步:安装requests库和BeautifulSoup库
- 2. 第二步:获取爬虫所需的header和cookie
- 3. 第三步:获取网页
- 4. 第四步:解析网页
- 5. 第五步:分析得到的信息,简化地址
- 6. 第六步:爬取内容,清洗数据
- 7. 爬取CSDN博客的代码实例以及结果展示
python爬虫 - 爬取html格式数据(CSDN博客)
python爬虫六部曲:
-
第一步:安装requests库和BeautifulSoup库
-
第二步:获取爬虫所需的header和cookie
-
第三步:获取网页
-
第四步:解析网页
-
第五步:分析得到的信息,简化地址:
-
第六步:爬取内容,清洗数据
1. 第一步:安装requests库和BeautifulSoup库
在程序中引用两个库的书写是这样的:
import requests
from bs4 import BeautifulSoup
以pycharm为例,在pycharm上安装这两个库的方法。在菜单【文件】–>【设置】->【项目】–>【Python解释器】中,在所选框中,点击软件包上的+号就可以进行查询插件安装了。有过编译器插件安装的hxd估计会比较好入手。具体情况就如下图所示。
2. 第二步:获取爬虫所需的header和cookie
以爬取 CSDN 博客的爬虫程序为例。
获取header和cookie是爬虫程序必需的一步,它直接决定了服务器是否会把爬虫的请求当作正常的浏览器访问,并返回完整的页面内容。
- 首先进入 CSDN 的页面(https://blog.csdn.net/BullKing8185?type=blog),按下F12,就会打开浏览器的开发者工具,找到其中的Network部分。如下图所示:
- 然后按下ctrl+R刷新页面,此时发现右边 NetWork 部分出现很多信息。
(如果进入后就有所需要的信息,就不用刷新了),当然刷新了也没啥问题。
-
过滤网络信息,并拷贝其 cURL 信息 ,
在 Network --> Filter 中,依据网址(https://blog.csdn.net/BullKing8185?type=blog)中的关键信息(如:BullKing8185) 或者 页面上的关键信息(如:python爬虫) 进行过滤 。 然后,我们浏览Name这部分,找到我们想要爬取的网络信息,鼠标右键,选择copy,复制下网页的URL。过滤后,有效信息会少很多,如下所示。选中所需的条目,右键 --> Copy --> Copy as cURL
- 利用工具 Convert curl commands to code https://curlconverter.com/python/ 进行转换
转换后信息如下图所示,选择【Copy to clipboard】,并黏贴到Pycharm开发环境中即可直接使用:
拷贝到 pycharm 等集成代码编辑软件中,可直接作为源代码使用.
import requests
cookies = {
'uuid_tt_dd': '10_20936681940-1687695659941-897712',
'UN': 'limeigui',
'ins_first_time': '1693811332416',
'_ga': 'GA1.1.1606228358.1692240870',
'_ga_7W1N0GEY1P': 'GS1.1.1698749086.7.1.1698749172.43.0.0',
'log_Id_click': '117',
'log_Id_view': '558',
'log_Id_pv': '378',
'Hm_lvt_e5ef47b9f471504959267fd614d579cd': '1708345516',
'ssxmod_itna': 'iqGxuDBD2AKrqGHqaWxUhGQqi=Z+xeDk+Dmg04GNpUYDZDiqAPGhDC38FmBm0jwIdf4804GCi2bqxaAS77gpAIKz2mpYD74i8DCqi1D0qDY+oxBLrbQoxiiyDCmFDPrKD32xlIzDvxG=D3qDFYqDLDMNDFqG0l+QPD0Pq+mDlD73DUwdDQqDSUQKKxGjDxitRDGADx0tUD7jD2eQDeMpTcqGW0wD2zBh8YYaSR=y4cjTiP6WW5cWU7ZnCaONVemQDbRLHweXfxQ0CNODvmAvzSGPqWDhw30Gc+7xeU+1SwA/riBqeRf+3YDDGbxeA4bix4D=',
'ssxmod_itna2': 'iqGxuDBD2AKrqGHqaWxUhGQqi=Z+xeDk+DmgDA6WmhxD/Q1DFr21/4Pgp7KAPPuKOBaiGcS0MH0QPvcM+RwKb9uvohWzcgyAOYiPq2NDgQdjoj/l8LYmsENvW2Ax0MEGC2eVUWk53KncOecGsZ7RYU2PWXW0TcABidaTWCDo1XCnrdYPFIAGIzgqHPYSN48+zk+p4IAErxE0tHrOIFokWcFdKt+o87hYQ1YYzhbIsEf7d78O0u7iAADObNMGIfZE+/YBr3mnIRK3Uiqwa4IguIDw1cfH9iTGTT6qgO8KXrczLohsiEZ+2GvH2A3Z4uGvAeelAYMODZnqKfmznxYzIOnx4l3eZP7hwuAdMl6Yh=Q35NSpa=IrK6bS50OwVl5=O5Mjt72PSnrsB5u7482pFPPlWtplXa6ihBrMiPwFq7CeeaXdU6G0n2Z2KjWAhaMeRvKNNRG8jfFcqqn0OYRn8IzOQbt0O9quQD1PeXWPY1=MEpu1KjSUg2I4roD8NDL=E0ePVyofFig7Hm4DQI4zi59hpwnjtqkUCIpcYBC+0hlnlyUhQ0q4UGzKdf6o=K0YD08DiQ4YD===',
'tfstk': 'eoFMjtVQpLYXhdd7Jlh19IAf0TBKBhGjcodxDjnVLDoC5S3OC-00xlvtHR3TKxqQAlHVfc30nDqLBIhaiJO0PrNO5jQs1PGjggIRwI4_5jNZPRglwtKlqssR2_CdQelcjgetASaRYOsdBv5aaND-qUxFULPB77oz7czTWW8IQdatxIR4tJDS4imHgIPnSglyLLoTlIgFkWJXhAuI-0QMz_Nc-ETuCwbHFOMZRVIR-wvX8AuI-kQh-LTKQ2gtp',
'c_dl_um': '-',
'UserName': 'limeigui',
'UserInfo': '3b95b21938904a148617bb63e4cd8b47',
'UserToken': '3b95b21938904a148617bb63e4cd8b47',
'UserNick': 'Adunn',
'AU': '95B',
'BT': '1709514903656',
'p_uid': 'U010000',
'Hm_up_6bcd52f51e9b3dce32bec4a3997715ac': '{"islogin":{"value":"1","scope":1},"isonline":{"value":"1","scope":1},"isvip":{"value":"0","scope":1},"uid_":{"value":"limeigui","scope":1}}',
'FCNEC': '[["AKsRol-81BVROqAOt-Krga723o1zn0lKQZXCHVYOsZGp4bbSYqIORsTWdRA33h_JQeCm1pUeYYkPLifSrDfR3ebNunu-COvYV2D4sqzbPrXD_tVp9je9p1aG1qgVGpkYlxpNK3mEnUaabXB6IvFt7xBeYqz_845gCA=="]]',
'c_dl_fref': 'https://www.baidu.com/link',
'c_dl_fpage': '/download/qq_27308505/21132392',
'c_dl_prid': '1711094139943_541172',
'c_dl_rid': '1711094192991_543576',
'limeiguicomment_new': '1706325449636',
'management_ques': '1712733666636',
'c_segment': '0',
'Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac': '1712450998,1712540146,1712628949,1713335190',
'_clck': 'v3yta0|2|fl2|0|1559',
'log_Id_click': '118',
'c_pref': 'default',
'c_first_ref': 'default',
'dc_sid': '9a3d776b6aa46403c7fa216f2ec8d588',
'creative_btn_mp': '3',
'fpv': 'd7fa221d14d24b2361ad4f09bace739a',
'yd_captcha_token': 'dzp',
'dc_session_id': '10_1713523054888.848382',
'c_ref': 'default',
'log_Id_pv': '379',
'log_Id_view': '559',
'__gads': 'ID=79df0b17ce2ed235-22263755acb40040:T=1687695664:RT=1713524387:S=ALNI_MbKkmtHbLa1eh1RSXbuJOoatVdiiQ',
'__gpi': 'UID=00000c6ade25eb9f:T=1687695664:RT=1713524387:S=ALNI_MZP0oVAi-DQb_-PpFvwoGO0EYhHiQ',
'__eoi': 'ID=4a7618f393a07404:T=1706249283:RT=1713524387:S=AA-AfjbTLqbpP5gZ44TgXYWkx20B',
'_clsk': '1k295vw|1713524388493|4|0|n.clarity.ms/collect',
'SidecHatdocDescBoxNum': 'true',
'c_first_page': 'https://blog.csdn.net/BullKing8185?type=blog',
'c_dsid': '11_1713525600201.216918',
'c_page_id': 'default',
'Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac': '1713525601',
'dc_tos': 'sc6t49',
}
# Request headers captured from the browser via "Copy as cURL".
# The Cookie header is deliberately not listed here: the cookie values are
# passed separately through the `cookies=` argument of requests.get().
# (The previously commented-out Cookie line had been wrapped onto a second,
# uncommented line, which made this dict literal a syntax error.)
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Linux"',
}
# Query-string parameters; requests appends them to the URL as ?type=blog.
params = {
    'type': 'blog',
}
# A timeout stops the script from hanging forever if the server never answers.
response = requests.get(
    'https://blog.csdn.net/BullKing8185',
    params=params,
    cookies=cookies,
    headers=headers,
    timeout=10,
)
3. 第三步:获取网页
通过requests.get() 即可获取网页内容:
# Fetch the blog list page; the timeout prevents an indefinite hang.
response = requests.get(
    'https://blog.csdn.net/BullKing8185',
    params=params,
    cookies=cookies,
    headers=headers,
    timeout=10,
)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page.
response.raise_for_status()
# Force UTF-8 so the Chinese text in the page decodes correctly.
response.encoding = 'utf-8'
# Parse the HTML with the parser bundled with the standard library.
soup = BeautifulSoup(response.text, 'html.parser')
print(f'soup value= {soup}')
4. 第四步:解析网页
这个时候,我们需要回到网页: 【按下F12】–> 【找到网页的Elements部分】 --> 【选中左上角的小框带箭头的标志】,进入【内容选择模式】,如下图,当点击(或鼠标移动到)对应网页内容时,这个时候网页就会自动在右边显示出你获取网页部分对应的代码。
在找到想要爬取的页面部分的网页内容后,在相应条目(如:“python爬虫-爬取图片”)上右键,就会退出【内容选择模式】。
然后将鼠标放置于 【Element】中对应的代码上,右键 -> copy --> selector。就如图所示。
拷贝后获取到的信息如下:
#userSkin > div.user-profile-body > div > div.user-profile-body-right > div.navList-box > div.mainContent > div > article:nth-child(2) > a > div.list-box-cont > div:nth-child(1) > div.blog-list-box-top > h4
再次拷贝另外一个 文章的信息(python爬虫 - 爬取微博热搜数据):
#userSkin > div.user-profile-body > div > div.user-profile-body-right > div.navList-box > div.mainContent > div > article:nth-child(3) > a > div.list-box-cont > div:nth-child(1) > div.blog-list-box-top > h4
5. 第五步:分析得到的信息,简化地址
黏贴到文本文件中信息如下:
#userSkin > div.user-profile-body > div > div.user-profile-body-right > div.navList-box > div.mainContent > div > article:nth-child(2) > a > div.list-box-cont > div:nth-child(1) > div.blog-list-box-top > h4
同理再黏贴:
#userSkin > div.user-profile-body > div > div.user-profile-body-right > div.navList-box > div.mainContent > div > article:nth-child(3) > a > div.list-box-cont > div:nth-child(1) > div.blog-list-box-top > h4
其实刚才复制的selector就相当于网页上对应部分存放的地址。由于我们需要的是网页上的一类信息,所以我们需要对获取的地址进行分析,提取。
当然,就用那个地址也是可行的,就是只能获取到你选择的网页上的那部分内容。
可以发现几个地址有很多相同的地方,唯一不同的地方就是 article 部分。由于 article 是网页标签,后面的 :nth-child(n) 是对它的补充限定,只选中第 n 个子元素。可以推断出,该类信息就存储在各个 article 的子元素中,我们去掉 :nth-child(n),直接对 article 进行信息提取,就可以获取到该部分对应的所有信息。所以提炼后的地址为:
#userSkin > div.user-profile-body > div > div.user-profile-body-right > div.navList-box > div.mainContent > div > article > a > div.list-box-cont > div:nth-child(1) > div.blog-list-box-top > h4
这个过程对js类语言有一定了解的hxd估计会更好处理。不过没有js类语言基础也没关系,主要步骤就是,保留相同的部分就行,慢慢的试,总会对的。
6. 第六步:爬取内容,清洗数据
这一步完成后,我们就可以直接爬取数据了。用一个标签存储上面提炼出的像地址一样的东西。标签就会拉取到我们想获得的网页内容。
# Content to scrape: CSS selector for the <h4> under each <article> entry.
# It must be assigned to `content` as a string; a bare "#userSkin ..." line
# is only a Python comment and leaves `content` undefined.
content = "#userSkin > div.user-profile-body > div > div.user-profile-body-right > div.navList-box > div.mainContent > div > article > a > div.list-box-cont > div:nth-child(1) > div.blog-list-box-top > h4"
之后我们就要用 soup 和 text 过滤掉不必要的信息,比如js类语言,排除这类语言对于信息受众阅读的干扰。这样我们就成功地将信息爬取下来了。
# Clean the data: keep only the elements matched by the CSS selector in `content`
a = soup.select(content)
将数据存储到文件中,所以会有 write 带来的写操作。想把数据保存在哪里,或者想怎么用,就看读者自己了。
# Store the data: append one line of text per matched element.
# The with-block closes the file even if a write raises part-way through.
with open("./weibo_down.txt", 'a', encoding="utf-8") as fo:
    for i, item in enumerate(a):
        # Replace the element with its plain text, then write that text.
        a[i] = item.text
        fo.write(a[i] + '\n')
但此时如果执行程序,会发现并没有爬取到数据:
主要在于 content 内容的调整,对比后发现,虽然 copy 出来的信息,与F12中看到的html层次结构一致,但还是需要做下调整, 调整后格式如下:
爬取内容(调整前)
#userSkin > div.user-profile-body > div > div.user-profile-body-right > div.navList-box > div.mainContent > div > article > a > div.list-box-cont > div:nth-child(1) > div.blog-list-box-top > h4
爬取内容(调整后)
#userSkin > div.user-profile-body > div > div.user-profile-body-right > div.navList-box > div.mainContent > div > div > div > article > a > div.list-box-cont > div:nth-child(1) > div.blog-list-box-top > h4
7. 爬取CSDN博客的代码实例以及结果展示
import requests
from bs4 import BeautifulSoup
cookies = {
'uuid_tt_dd': '10_20936681940-1687695659941-897712',
'UN': 'limeigui',
'ins_first_time': '1693811332416',
'_ga': 'GA1.1.1606228358.1692240870',
'_ga_7W1N0GEY1P': 'GS1.1.1698749086.7.1.1698749172.43.0.0',
'log_Id_click': '117',
'log_Id_view': '558',
'log_Id_pv': '378',
'Hm_lvt_e5ef47b9f471504959267fd614d579cd': '1708345516',
'ssxmod_itna': 'iqGxuDBD2AKrqGHqaWxUhGQqi=Z+xeDk+Dmg04GNpUYDZDiqAPGhDC38FmBm0jwIdf4804GCi2bqxaAS77gpAIKz2mpYD74i8DCqi1D0qDY+oxBLrbQoxiiyDCmFDPrKD32xlIzDvxG=D3qDFYqDLDMNDFqG0l+QPD0Pq+mDlD73DUwdDQqDSUQKKxGjDxitRDGADx0tUD7jD2eQDeMpTcqGW0wD2zBh8YYaSR=y4cjTiP6WW5cWU7ZnCaONVemQDbRLHweXfxQ0CNODvmAvzSGPqWDhw30Gc+7xeU+1SwA/riBqeRf+3YDDGbxeA4bix4D=',
'ssxmod_itna2': 'iqGxuDBD2AKrqGHqaWxUhGQqi=Z+xeDk+DmgDA6WmhxD/Q1DFr21/4Pgp7KAPPuKOBaiGcS0MH0QPvcM+RwKb9uvohWzcgyAOYiPq2NDgQdjoj/l8LYmsENvW2Ax0MEGC2eVUWk53KncOecGsZ7RYU2PWXW0TcABidaTWCDo1XCnrdYPFIAGIzgqHPYSN48+zk+p4IAErxE0tHrOIFokWcFdKt+o87hYQ1YYzhbIsEf7d78O0u7iAADObNMGIfZE+/YBr3mnIRK3Uiqwa4IguIDw1cfH9iTGTT6qgO8KXrczLohsiEZ+2GvH2A3Z4uGvAeelAYMODZnqKfmznxYzIOnx4l3eZP7hwuAdMl6Yh=Q35NSpa=IrK6bS50OwVl5=O5Mjt72PSnrsB5u7482pFPPlWtplXa6ihBrMiPwFq7CeeaXdU6G0n2Z2KjWAhaMeRvKNNRG8jfFcqqn0OYRn8IzOQbt0O9quQD1PeXWPY1=MEpu1KjSUg2I4roD8NDL=E0ePVyofFig7Hm4DQI4zi59hpwnjtqkUCIpcYBC+0hlnlyUhQ0q4UGzKdf6o=K0YD08DiQ4YD===',
'tfstk': 'eoFMjtVQpLYXhdd7Jlh19IAf0TBKBhGjcodxDjnVLDoC5S3OC-00xlvtHR3TKxqQAlHVfc30nDqLBIhaiJO0PrNO5jQs1PGjggIRwI4_5jNZPRglwtKlqssR2_CdQelcjgetASaRYOsdBv5aaND-qUxFULPB77oz7czTWW8IQdatxIR4tJDS4imHgIPnSglyLLoTlIgFkWJXhAuI-0QMz_Nc-ETuCwbHFOMZRVIR-wvX8AuI-kQh-LTKQ2gtp',
'c_dl_um': '-',
'UserName': 'limeigui',
'UserInfo': '3b95b21938904a148617bb63e4cd8b47',
'UserToken': '3b95b21938904a148617bb63e4cd8b47',
'UserNick': 'Adunn',
'AU': '95B',
'BT': '1709514903656',
'p_uid': 'U010000',
'Hm_up_6bcd52f51e9b3dce32bec4a3997715ac': '{"islogin":{"value":"1","scope":1},"isonline":{"value":"1","scope":1},"isvip":{"value":"0","scope":1},"uid_":{"value":"limeigui","scope":1}}',
'FCNEC': '[["AKsRol-81BVROqAOt-Krga723o1zn0lKQZXCHVYOsZGp4bbSYqIORsTWdRA33h_JQeCm1pUeYYkPLifSrDfR3ebNunu-COvYV2D4sqzbPrXD_tVp9je9p1aG1qgVGpkYlxpNK3mEnUaabXB6IvFt7xBeYqz_845gCA=="]]',
'c_dl_fref': 'https://www.baidu.com/link',
'c_dl_fpage': '/download/qq_27308505/21132392',
'c_dl_prid': '1711094139943_541172',
'c_dl_rid': '1711094192991_543576',
'limeiguicomment_new': '1706325449636',
'management_ques': '1712733666636',
'c_segment': '0',
'Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac': '1712450998,1712540146,1712628949,1713335190',
'_clck': 'v3yta0|2|fl2|0|1559',
'log_Id_click': '118',
'c_pref': 'default',
'c_first_ref': 'default',
'dc_sid': '9a3d776b6aa46403c7fa216f2ec8d588',
'creative_btn_mp': '3',
'fpv': 'd7fa221d14d24b2361ad4f09bace739a',
'yd_captcha_token': 'dzp',
'dc_session_id': '10_1713523054888.848382',
'c_ref': 'default',
'log_Id_pv': '379',
'log_Id_view': '559',
'__gads': 'ID=79df0b17ce2ed235-22263755acb40040:T=1687695664:RT=1713524387:S=ALNI_MbKkmtHbLa1eh1RSXbuJOoatVdiiQ',
'__gpi': 'UID=00000c6ade25eb9f:T=1687695664:RT=1713524387:S=ALNI_MZP0oVAi-DQb_-PpFvwoGO0EYhHiQ',
'__eoi': 'ID=4a7618f393a07404:T=1706249283:RT=1713524387:S=AA-AfjbTLqbpP5gZ44TgXYWkx20B',
'_clsk': '1k295vw|1713524388493|4|0|n.clarity.ms/collect',
'SidecHatdocDescBoxNum': 'true',
'c_first_page': 'https://blog.csdn.net/BullKing8185?type=blog',
'c_dsid': '11_1713525600201.216918',
'c_page_id': 'default',
'Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac': '1713525601',
'dc_tos': 'sc6t49',
}
# Request headers captured from the browser via "Copy as cURL".
# The Cookie header is deliberately not listed here: the cookie values are
# passed separately through the `cookies=` argument of requests.get().
# (The previously commented-out Cookie line had been wrapped onto a second,
# uncommented line, which made this dict literal a syntax error.)
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Linux"',
}
# Query-string parameters; requests appends them to the URL as ?type=blog.
params = {
    'type': 'blog',
}
# A timeout stops the script from hanging forever if the server never answers.
response = requests.get(
    'https://blog.csdn.net/BullKing8185',
    params=params,
    cookies=cookies,
    headers=headers,
    timeout=10,
)
# Force UTF-8 so the Chinese text in the page decodes correctly.
response.encoding = 'utf-8'
print(f'response= {response}')
# Fail loudly on a 4xx/5xx answer instead of parsing an error page.
response.raise_for_status()
print(f'response.text= {response.text}')
# Parse the HTML with the parser bundled with the standard library.
soup = BeautifulSoup(response.text, 'html.parser')
print(f'soup value= {soup}')
# CSS selector for the <h4> under each <article> entry of the blog list.
# DevTools "Copy selector" gave the same path with article:nth-child(2) /
# article:nth-child(3); dropping :nth-child(n) targets every article.
# The two extra "div" levels before "article" are required here: the
# selector without them matched nothing in the downloaded HTML.
content = " > ".join((
    "#userSkin",
    "div.user-profile-body",
    "div",
    "div.user-profile-body-right",
    "div.navList-box",
    "div.mainContent",
    "div",
    "div",
    "div",
    "article",
    "a",
    "div.list-box-cont",
    "div:nth-child(1)",
    "div.blog-list-box-top",
    "h4",
))
# Clean the data: keep only the elements matched by the CSS selector.
a = soup.select(content)
print(f'a value= {a}')
# Store the data: append one line of text per matched element.
# The with-block closes the file even if a write raises part-way through.
with open("./cdsn_down_20240419.txt", 'a', encoding="utf-8") as fo:
    for i, item in enumerate(a):
        # Replace the element with its plain text, then write that text.
        a[i] = item.text
        fo.write(a[i] + '\n')
调整后运行效果如下: