【头歌】Data Analysis and Practice - Python Web Crawling: Scrapy Crawler Basics, Web Data Parsing, requests Crawler, JSON Basics
- Pandas First Experience
  - Level 1: Scraping a Table from a Web Page
  - Level 2: Scraping Specified Cells from the Table
  - Level 3: Saving Cell Data to a List and Sorting It
  - Level 4: Scraping a div Tag
  - Level 5: Scraping Multiple div Tags on One Page
  - Level 6: Scraping Multiple div Tags Across Multiple Pages
- Scrapy Crawler Basics
  - Level 1: Scrapy Installation and Project Creation
  - Level 2: Scrapy Core Principles
- Web Data Parsing
  - Level 1: Parsing Web Pages with XPath
  - Level 2: Parsing Web Pages with BeautifulSoup
- requests Crawler
  - Level 1: requests Basics
  - Level 2: Advanced requests
- JSON Basics
  - Level 1: JSON Fundamentals
  - Level 2: Using the json Library
Pandas First Experience
Level 1: Scraping a Table from a Web Page
import requests
from bs4 import BeautifulSoup

# Request the census bulletin page and decode it as UTF-8
response = requests.get("https://tjj.hunan.gov.cn/hntj/tjfx/tjgb/pcgbv/202105/t20210519_19079329.html")
response.encoding = 'utf-8'
# Parse the HTML and print the first <table> element
soup = BeautifulSoup(response.text, "html.parser")
bg = soup.find('table')
print(bg)
Level 2: Scraping Specified Cells from the Table
import requests
from bs4 import BeautifulSoup

url = "https://tjj.hunan.gov.cn/hntj/tjfx/tjgb/pcgbv/202105/t20210519_19079329.html"
r = requests.get(url)
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")
bg = soup.find('table')
alltr = bg.find_all('tr')
# Skip the first three header rows, then print every <span> cell of each data row
for index, i in enumerate(alltr, 1):
    if index >= 4:
        allspan = i.find_all('span')
        for count, j in enumerate(allspan, 1):
            print(j.text, end=" ")
        print()
Level 3: Saving Cell Data to a List and Sorting It
import requests
from bs4 import BeautifulSoup

url = "https://tjj.hunan.gov.cn/hntj/tjfx/tjgb/pcgbv/202105/t20210519_19079329.html"
r = requests.get(url)
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")
bg = soup.find('table')

lb = []          # final [region, value] pairs, sorted by value
name_num = {}    # region name -> value
use = []         # values only, used to rank the rows
alltr = bg.find_all('tr')
# Skip the first three header rows; column 0 is the region name, column 1 the value
for index, i in enumerate(alltr, 1):
    if index >= 4:
        allspan = i.find_all('span')
        name = allspan[0].text
        num = allspan[1].text
        name_num[name] = int(num)
        use.append(int(num))
use.sort(reverse=True)
# Order the (name, value) pairs by the rank of their value in the descending list
lb = [[k, v] for k, v in sorted(name_num.items(), key=lambda item: use.index(item[1]))]
for lbxx in lb:
    print(lbxx[0], lbxx[1])
Level 4: Scraping a div Tag
import requests
from bs4 import BeautifulSoup

url = 'https://www.hnu.edu.cn/xysh/xshd.htm'
r = requests.get(url)
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, 'html.parser')
# Lecture time, title and summary each sit in a differently classed <div>
jzsj = soup.find('div', class_='xinwen-sj-top').string.strip()
jzbt = soup.find('div', class_='xinwen-wen-bt').string.strip()
jzdd = soup.find('div', class_='xinwen-wen-zy').text.strip()
# Write the three fields to a text file, one per line
f1 = open("jzxx.txt", "w")
f1.write(jzsj + "\n")
f1.write(jzbt + "\n")
f1.write(jzdd + "\n")
f1.close()
Level 5: Scraping Multiple div Tags on One Page
import requests
from bs4 import BeautifulSoup

url = 'https://www.hnu.edu.cn/xysh/xshd.htm'
r = requests.get(url)
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, 'html.parser')

jzxx = []
# Collect time, title and summary for every lecture entry on the page
# (assumes each entry uses the same div classes as in Level 4)
for sj, bt, zy in zip(soup.find_all('div', class_='xinwen-sj-top'),
                      soup.find_all('div', class_='xinwen-wen-bt'),
                      soup.find_all('div', class_='xinwen-wen-zy')):
    jzxx.append([sj.text.strip(), bt.text.strip(), zy.text.strip()])

f1 = open("jzxx2.txt", "w")
for xx in jzxx:
    f1.write(",".join(xx) + "\n")
f1.close()
Level 6: Scraping Multiple div Tags Across Multiple Pages
import requests
from bs4 import BeautifulSoup

# Only the output file is created here; the per-page scraping loop is sketched below
f1 = open("jz.txt", "w", encoding="utf8")
f1.close()
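The crawling loop itself is missing from the notes. A minimal sketch, reusing the imports above and assuming that the follow-up pages are numbered xshd/N.htm and reuse the div classes from Levels 4 and 5 (both are assumptions, so check the site's real pagination links before relying on this):

# Sketch only, not the graded answer: the URL pattern and page count are assumed
page_urls = ['https://www.hnu.edu.cn/xysh/xshd.htm']
page_urls += [f'https://www.hnu.edu.cn/xysh/xshd/{n}.htm' for n in range(2, 4)]

with open("jz.txt", "w", encoding="utf8") as f1:
    for url in page_urls:
        r = requests.get(url)
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'html.parser')
        # Same div classes as in the previous two levels
        for sj, bt, zy in zip(soup.find_all('div', class_='xinwen-sj-top'),
                              soup.find_all('div', class_='xinwen-wen-bt'),
                              soup.find_all('div', class_='xinwen-wen-zy')):
            f1.write(",".join([sj.text.strip(), bt.text.strip(), zy.text.strip()]) + "\n")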
Scrapy Crawler Basics
Level 1: Scrapy Installation and Project Creation
Install Scrapy, create a project, and generate the spider from inside the project directory. Only the genspider command appears in the original notes; the surrounding steps are the standard Scrapy workflow, with the project name left as a placeholder:
pip install scrapy
scrapy startproject <project_name>
cd <project_name>
scrapy genspider Hello www.educoder.net
Level 2: Scrapy Core Principles
import scrapy

class WorldSpider(scrapy.Spider):
    name = 'world'
    allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.baidu.com/']

    def parse(self, response):
        # Save the response body as "baidu.html" (second dot-separated part of the URL)
        baidu = response.url.split(".")[1] + '.html'
        with open(baidu, 'wb') as f:
            f.write(response.body)
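Running `scrapy crawl world` from the project directory exercises the whole pipeline this level is about: the engine takes the URL from start_urls, the scheduler queues the request, the downloader fetches the page, and the response is handed back to parse(), which here writes the page body to baidu.html.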
Web Data Parsing
Level 1: Parsing Web Pages with XPath
import urllib.request
from lxml import etree

def get_data(url):
    '''
    :param url: the URL to request
    :return: None (the extracted link texts are printed)
    '''
    response = urllib.request.urlopen(url=url)
    html = response.read().decode("utf-8")
    # Build an lxml element tree and select the link texts with an XPath expression
    parse = etree.HTML(html)
    item_list = parse.xpath("//div[@class='left']/ul/li/span/a/text()")
    print(item_list)
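A minimal call for illustration; the address below is a hypothetical placeholder, the real list page is supplied by the platform:

if __name__ == '__main__':
    # Hypothetical address; replace with the URL given by the exercise
    get_data("https://example.com/list.html")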
Level 2: Parsing Web Pages with BeautifulSoup
import requests
from bs4 import BeautifulSoup

def get_data(url, headers):
    '''
    Two parameters
    :param url: the URL to request
    :param headers: request headers
    :return data: a list with the text of every poem on the page
    '''
    obj = requests.get(url, headers=headers)
    soup = BeautifulSoup(obj.content, "lxml", from_encoding="utf-8")
    # Each poem sits in a <li> under the div with class "left"; keep only the <p> text
    data = soup.find("div", class_='left').ul.find_all("li")
    data = [i.p.text for i in data]
    return data
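For reference, a hypothetical call with a minimal header; the real URL and headers come from the exercise:

if __name__ == '__main__':
    headers = {"User-Agent": "Mozilla/5.0"}          # minimal header for illustration
    for poem in get_data("https://example.com/poems.html", headers):
        print(poem)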
requests Crawler
Level 1: requests Basics
import requests

def get_html(url):
    '''
    :param url: the URL to request
    :return: html, the page source as text
    '''
    # A browser-like User-Agent keeps the server from rejecting the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    html = res.text
    return html
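A quick smoke test of the function (the URL is only an example):

if __name__ == '__main__':
    print(get_html("https://www.educoder.net")[:200])   # print the first 200 characters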
Level 2: Advanced requests
import requests

def get_html(url):
    '''
    :param url: the URL to request (login endpoint)
    :return html: the page source after logging in
    :return sess: the session that was created
    '''
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
    }
    # A Session keeps cookies between requests, so the login state survives the second call
    sess = requests.session()
    data = {
        "name": "hblgysl",
        "password": "hblgzsx",
    }
    # Log in with a POST, then fetch the same URL again within the logged-in session
    res = sess.post(url, headers=headers, data=data)
    res1 = sess.get(url)
    html = res1.text
    return html, sess
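The point of returning the session is that cookies set by the login POST are sent automatically on every later request. A hypothetical continuation (both URLs are illustrative, not from the exercise):

html, sess = get_html("https://example.com/login")
# The same session carries the login cookie to any further page
profile_page = sess.get("https://example.com/profile").text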
JSON Basics
Level 1: JSON Fundamentals
{
    "students": [
        { "name": "赵昊", "age": 15, "ismale": true },
        { "name": "龙傲天", "age": 16, "ismale": true },
        { "name": "玛丽苏", "age": 15, "ismale": false }
    ],
    "count": 3
}
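In Python such a document maps directly onto built-in types: JSON objects become dicts, arrays become lists, and true/false become booleans. A quick check with the json library, using a shortened copy of the data above:

import json

doc = '{"students": [{"name": "赵昊", "age": 15, "ismale": true}], "count": 1}'
obj = json.loads(doc)                              # object -> dict, array -> list, true -> True
print(obj["students"][0]["name"], obj["count"])    # 赵昊 1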
Level 2: Using the json Library
import json

def Func():
    # Read the existing JSON file for 2017
    data = open("step2/2017.txt", "r", encoding="utf-8")
    obj = json.load(data)
    data.close()
    # Replace the loaded object with the data required for 2018
    obj = {
        "count": 4,
        "infos":
        [
            {"name": "赵昊", "age": 16, "height": 1.83, "sex": "男性"},
            {"name": "龙傲天", "age": 17, "height": 2.00, "sex": "男性"},
            {"name": "玛丽苏", "age": 16, "height": 1.78, "sex": "女性"},
            {"name": "叶良辰", "age": 17, "height": 1.87, "sex": "男性"}
        ]
    }
    # Serialize the new object to the 2018 file
    output = open("step2/2018.txt", "w", encoding="utf-8")
    json.dump(obj, output)
    output.close()
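To confirm the round trip, Func() can be run and the new file loaded back. Note that json.dump with the default ensure_ascii=True writes the Chinese strings as \uXXXX escapes in the file, but they decode back unchanged:

Func()
with open("step2/2018.txt", "r", encoding="utf-8") as f:
    check = json.load(f)
print(check["count"], check["infos"][0]["name"])   # prints: 4 赵昊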