首页 前端知识 使用Nodejs爬取网页某个数据并把爬到的数据写入excel (服务端部分)

使用Nodejs爬取网页某个数据并把爬到的数据写入excel (服务端部分)

2024-02-01 12:02:50 前端知识 前端哥 947 520 我要收藏

本文承接我的上一篇文章(地址如下):前端把请求发送过来之后,服务端拿到提交的链接数据,先对其进行解析,然后按设定的时间间隔逐一请求这些链接。

https://blog.csdn.net/qq_45104282/article/details/127669095

详情请看代码:

部分方法做了封装,以提高代码可读性、减少重复代码。

启动服务需要先安装 Node.js 运行环境(可自行安装),启动命令为:node 文件名

例如:node scrapingPage.js

//引入模块
const express=require('express'),  
urls  = require('url'),
querystring=require('querystring'),
morgan=require('morgan'),
request = require('superagent'),
fs = require('fs'),
cheerio = require('cheerio'),
excelJS = require('exceljs')

// Module-level scratch state. `list` is never assigned in the visible code;
// `data` is the shared buffer the scraper fills and clears.
let list;
let data = [];

const app = express();

// Request log file, opened in append mode ('a') so earlier entries are kept.
let accessLogStream = fs.createWriteStream('./access.log', { flags: 'a' });

// Log every request in morgan's "short" format to the file stream above.
app.use(morgan('short', { stream: accessLogStream }));

/**
 * /scrapingPage endpoint.
 *
 * Expects a urlencoded body with two fields:
 *   - code: a JSON-encoded array of URLs to scrape
 *   - time: the interval between requests, in seconds
 *
 * The body is read manually from the request stream. Once it is complete the
 * URLs are handed to resultData and the response is ended right away, without
 * waiting for the scheduled scrapes.
 */
app.all('/scrapingPage',(req,response)=>{
    // Express has already matched the route, so no extra pathname check is
    // needed. The previous exact-string comparison left requests such as
    // "/scrapingPage/" unanswered, because the route matched but the check failed.
    let names='';

    req.on('data',(chunk)=>{
        names+=chunk;
    });

    req.on('end',()=>{
        const params=querystring.parse(names);

        // A missing or malformed "code" field must not throw inside this
        // listener: an uncaught exception here would take the whole server down.
        let str;
        try {
            str=JSON.parse(params.code);
        } catch (e) {
            response.status(400).end('Invalid "code": expected a JSON array of URLs');
            return;
        }
        if (!Array.isArray(str)) {
            response.status(400).end('Invalid "code": expected a JSON array of URLs');
            return;
        }

        console.log('---------------链接',str);
        // params.time is the request interval in seconds (still a string here).
        resultData(str,params.time);
        response.end();
    });
});

// Bind the HTTP server to port 8888 and log once it is listening.
app.listen(8888, function () {
    console.log('服务器已启动,监听端口中');
});

/**
 * Schedule one scrape per URL, spacing consecutive requests
 * `intervalTimeValue` seconds apart (the first one fires immediately).
 *
 * @param {Array<string>} urlArr URLs to scrape, in order.
 * @param {number|string} intervalTimeValue Gap between requests in seconds;
 *        the route handler passes it through as a string.
 */
function resultData(urlArr,intervalTimeValue) {
    // Coerce once. A missing, non-numeric or negative value means "no delay".
    const parsedSeconds = Number(intervalTimeValue);
    const intervalMs = Number.isFinite(parsedSeconds) && parsedSeconds > 0 ? parsedSeconds * 1000 : 0;

    urlArr.forEach((url, index) => {
        // The delay is derived from this URL's position in the list. The
        // previous code multiplied by an undeclared `i`, which threw a
        // ReferenceError before any request was scheduled.
        setTimeout(() => {
            capturePageResult(url);
        }, index * intervalMs);
    });
}

/**
 * Fetch one product page and record its rank text in the spreadsheet.
 *
 * Two page layouts are probed:
 *   - the `#productDetails_detailBullets_sections1` table
 *   - the `#detailBulletsWrapper_feature_div > ul > li` bullet list
 * A row is written through readXlsxFile for each layout that is present.
 * A placeholder row is written when neither layout exists or when the
 * request itself fails.
 *
 * @param {string} urlArr URL of the page to scrape (a single URL, despite the name).
 */
function capturePageResult(urlArr) {
    request.get(urlArr.toString())
        .end((err, res) => {
            if (err) {
                console.log(urlArr + '请求延时稍后再试');
                readXlsxFile(urlArr,"The request timeout please try again later(请求延时)",formattingDate());
                return;
            }

            // Parse the fetched HTML.
            const $ = cheerio.load(res.text, {decodeEntities: false});
            const $itemMod = $('#productDetails_detailBullets_sections1');
            const $itemMods = $('#detailBulletsWrapper_feature_div > ul > li');
            const tableLen = $itemMod.length;
            const spanLen = $itemMods.length;
            console.log('tableLen长度', tableLen);
            console.log('spanLen长度', spanLen);

            if (tableLen > 0) {
                // Local buffer instead of the shared module-level array, so an
                // exception part-way through can never leak text into the next page.
                const ratingTails = [];

                $itemMod.each((i, e) => {
                    // `const` here: the old bare `$e = ...` created an implicit
                    // global and throws under strict mode / ES modules.
                    const spanText = $(e).find('span').text();
                    // Whatever follows the last "ratings" marker (the whole text
                    // when the marker is absent).
                    const afterRatings = spanText.split("ratings").slice(-1).toString();

                    if (afterRatings === "") {
                        const rankParts = spanText.split("\n")[0].split("#");
                        if (rankParts.length >= 6) {
                            // First five de-duplicated segments. slice() avoids
                            // emitting a literal "undefined" when de-duplication
                            // leaves fewer than five entries.
                            const rankStrSave = noRepeat(rankParts).slice(0, 5).join('\n#');
                            readXlsxFile(urlArr, rankStrSave, formattingDate());
                        }
                        return;
                    }

                    ratingTails.push(afterRatings);
                    const rankNewStr = noRepeat(ratingTails.toString().split('#'));
                    console.log('rankNewStr',rankNewStr);
                    // Skip the text before the first '#', keep at most three ranks.
                    const rankData = rankNewStr.length >= 2
                        ? '#' + rankNewStr.slice(1, 4).join('\n#')
                        : '';
                    console.log('rankData',rankData);
                    readXlsxFile(urlArr, rankData, formattingDate());
                });
            }

            if (spanLen === 0 && tableLen === 0){
                console.log(urlArr + '没有商品排名');
                readXlsxFile(urlArr,"Not Data",formattingDate());
            }

            if (spanLen > 0){
                const bulletTexts = [];
                $itemMods.each((i, e) => {
                    bulletTexts.push($(e).find('span').text().trim());
                });

                // Keep only the text between "Best Sellers Rank:" and "Customer Reviews:".
                const removeBest = bulletTexts.toString().split("Customer Reviews:")[0].toString();
                const rankText = removeBest.split("Best Sellers Rank:  ")[1];
                console.log(rankText);

                if (rankText === undefined){
                    console.log(urlArr + '没有商品排名');
                    readXlsxFile(urlArr,"The commodity doesn't rank(该商品没有排名)",formattingDate());
                } else {
                    // Put every "#n ..." rank on its own line.
                    readXlsxFile(urlArr,rankText.replace(/#{1}\s*/g,"\n#"),formattingDate());
                }
            }
        });
}

/**
 * De-duplicate a list after trimming surrounding whitespace from every entry.
 *
 * Entries are converted to strings, trimmed, and returned in first-seen order.
 * A Set gives a single linear pass instead of an indexOf scan per element.
 *
 * @param {Array<*>} arr Values to normalise (typically strings).
 * @returns {string[]} Unique trimmed strings, in order of first appearance.
 */
function noRepeat(arr) {
    const seen = new Set();
    for (const item of arr) {
        // String() rather than .toString() so null/undefined entries do not throw.
        seen.add(String(item).trim());
    }
    return [...seen];
}

/**
 * Build a timestamp for "now" in local time, shaped like "2024/2/1 9:05:03".
 * Minutes and seconds are zero-padded to two digits; year, month, day and
 * hour are written as-is.
 * @returns {string}
 */
function formattingDate() {
    const now = new Date();
    const twoDigits = (value) => String(value).padStart(2, '0');

    const datePart = [now.getFullYear(), now.getMonth() + 1, now.getDate()].join('/');
    const timePart = [
        now.getHours(),
        twoDigits(now.getMinutes()),
        twoDigits(now.getSeconds())
    ].join(':');

    return `${datePart} ${timePart}`;
}

// Tail of the pending-write chain. Every call reads, modifies and rewrites the
// same file, so overlapping calls would each start from the same snapshot and
// the later write would drop the earlier row. Chaining runs them one by one.
let xlsxWriteQueue = Promise.resolve();

/**
 * Add a worksheet with the standard three-column header: widths, centred
 * header cells, header height, filter row and frozen first row.
 *
 * @param workbook  exceljs Workbook to add the sheet to
 * @param {string} sheetName tab name of the new sheet
 * @returns the created worksheet
 */
function createRankSheet(workbook, sheetName) {
    const sheet = workbook.addWorksheet(sheetName, {views: [{state: 'frozen', ySplit: 1}]});
    // Column keys must be plain strings (they were previously set to
    // arrays/undefined). Assigning `header` here also fills row 1.
    sheet.columns = [
        {header: '编号', key: 'code', width: 40},
        {header: '商品排名详细信息', key: 'rank', width: 130},
        {header: '获取时间', key: 'date', width: 20}
    ];
    const headerRow = sheet.getRow(1);
    headerRow.height = 30;
    [1, 2, 3].forEach((col) => {
        headerRow.getCell(col).alignment = {vertical: 'middle', horizontal: 'center'};
    });
    sheet.autoFilter = 'A1:C1';
    return sheet;
}

/**
 * Read (or create) ./readFile.xlsx, append one row and write the file back.
 * Errors are logged here, so the returned promise never rejects.
 */
async function appendResultRow(code, name, date) {
    const files = './readFile.xlsx';
    const workbook = new excelJS.Workbook();

    try {
        // Existence check only; the old code read the whole binary file as
        // UTF-8 text just to learn whether it was there.
        const fileExists = await fs.promises.access(files).then(() => true, () => false);

        let sheet;
        if (fileExists) {
            await workbook.xlsx.readFile(files);

            // Always append to the most recently added worksheet.
            const sheetCount = workbook.worksheets.length;
            sheet = workbook.worksheets[sheetCount - 1];
            const sheetLine = sheet.lastRow ? sheet.lastRow.number : 0;
            console.log('sheetLine=====>', sheetLine);

            // Past 20000 rows, continue on a fresh worksheet.
            if (sheetLine > 20000) {
                sheet = createRankSheet(workbook, `商品排名统计表${sheetCount + 1}`);
            }
        } else {
            sheet = createRankSheet(workbook, '商品排名统计表');
        }

        // Style the row that was actually added. Addressing cells by the old
        // sheet's row number touched row 20002 of a new overflow sheet, which
        // inflated its lastRow and made every later call create another sheet.
        const row = sheet.addRow([code, name, date]);
        row.height = 100;
        [1, 2, 3].forEach((col) => {
            row.getCell(col).alignment = {vertical: 'middle', horizontal: 'center'};
        });

        // Await the write so a failure lands in the catch below instead of
        // becoming an unhandled rejection.
        await workbook.xlsx.writeFile(files);
        console.log(`${code}的内容已保存到${files.substring(2,10)}文件${fileExists ? '(2号入口)' : ''}`);
    } catch (e) {
        console.log(`${code}失效  请重新获取`);
        console.error(e);
    }
}

/**
 * Append one scrape result to ./readFile.xlsx, creating the workbook with a
 * header row on first use. Calls are queued and applied in call order.
 *
 * @param code URL (or identifier) of the scraped page, written to column A
 * @param name rank text or status message, written to column B
 * @param date timestamp string, written to column C
 * @returns {Promise<void>} settles once this row has been written or the
 *          failure has been logged; callers may ignore it.
 */
function readXlsxFile(code,name,date) {
    console.log([[code,name,date]]);

    xlsxWriteQueue = xlsxWriteQueue.then(() => appendResultRow(code, name, date));
    return xlsxWriteQueue;
}


服务端启动之后,结合我的上一篇文章,在前端提交四个链接并把请求间隔设为五秒,看数据是否成功写入 Excel:

 

爬到的数据写入成功,且间隔是每五秒请求一次。overover~~~

如果有不懂的欢迎在评论区留言,我会一一解答的,或者私聊也可。下期见!!

转载请注明出处或者链接地址:https://www.qianduange.cn//article/1055.html
评论
发布的文章

Ajax用法总结

2024-02-14 09:02:07

JQuery之jsTree树形插件

2024-02-14 09:02:01

Why React Doesn‘t Need jQuery?

2024-02-14 09:02:00

jQuery模板字符串

2024-02-14 09:02:58

大家推荐的文章
会员中心 联系我 留言建议 回顶部
复制成功!