仅供学习参考
一、获取HTML网页中的文本和链接,并写入TXT文件
# Step 1: collect the link and display text of every person entry on a
# listing page and save them to a UTF-8 text file, one record per entry:
#
#     网址路径:<absolute URL>
#     姓名:<link text>
#     <blank line>
#
# The file produced here is the input of the second script, which looks for
# "http(s)://" URLs on the first line of each record.
from urllib.parse import urljoin

import requests
from lxml import html

# Placeholder: replace with the listing page you want to scrape.
base_url = "https://abcdef自己的网址要改"

# XPath copied from the browser dev tools. Only the index of the <li>
# changes between entries; adapt the fixed part to your page.
fixed_xpath = "/html/body/div[4]/div[2]/ul/li[{div_index}]/a"

# Highest entry index to probe. Indices with no matching element are
# skipped silently, so this only needs to be an upper bound.
max_people = 100

filename = "现TXT文本内容.txt"

# A timeout keeps the script from hanging forever on an unresponsive server;
# raise_for_status() stops early instead of silently parsing an error page.
response = requests.get(base_url, timeout=30)
response.raise_for_status()

# Parse the raw bytes and state the encoding on the parser itself. Setting
# response.encoding would have no effect because response.text is never used.
tree = html.fromstring(
    response.content, parser=html.HTMLParser(encoding="utf-8")
)

with open(filename, "w", encoding="utf-8") as f:
    # range() excludes its stop value, hence the + 1 to include max_people.
    for div_index in range(1, max_people + 1):
        xpath = fixed_xpath.format(div_index=div_index)

        for person_element in tree.xpath(xpath):
            href = person_element.get("href")
            # urljoin leaves absolute hrefs untouched and resolves relative
            # ones against the listing page, so the file always holds a URL
            # that can be opened directly.
            url_path = urljoin(base_url, href) if href else ""
            # string() concatenates all descendant text of the <a> element.
            name = person_element.xpath("string()").strip()

            output_str = f"网址路径:{url_path}\n姓名:{name}\n\n"
            print(output_str)
            f.write(output_str)

print(f"输出已保存到文件 {filename}")
| |
复制
| 结果:现TXT文本内容 |
| |
| 网址路径:http://...(完整链接) |
| 姓名:abc |
| |
复制
二、读取上一步生成的TXT文件,逐个打开其中的链接并提取需要的内容(例如邮箱),将该内容写在姓名之后,输出为新的TXT文件
# Step 2: read the records written by the first script (a URL line followed
# by a "姓名:<name>" line), open every URL, extract the email address with
# an XPath, and write "<name>:<email>" lines to a new UTF-8 text file.
import re

import requests
from lxml import html

source_filename = "现TXT文本内容.txt"
filename = "现TXT文本内容邮箱.txt"

# XPath of the text node holding the email on each detail page; copy it from
# the browser dev tools and adapt it to your page.
email_xpath = '/html/body/div[4]/div/div/div/div/div[2]/div[1]/div[2]/div[4]/div[1]/text()'

# Placeholder written when no email could be obtained for a person.
not_found = "未找到邮箱地址"

# Compiled once because they are applied to every line of the input.
url_pattern = re.compile(r"https?://[^\s]+")
name_pattern = re.compile(r"姓名:(.+)")

# Read as bytes and decode leniently so a stray invalid byte does not abort
# the whole run.
with open(source_filename, "rb") as file:
    content = file.read().decode('utf-8', 'ignore')

lines = content.splitlines()

with open(filename, "w", encoding="utf-8") as f:
    # Pair every line with the line that follows it. zip() stops at the
    # shorter sequence, so the last line is never paired with a missing
    # successor (indexing lines[i + 1] raised IndexError there).
    for url_line, name_line in zip(lines, lines[1:]):
        url_match = url_pattern.search(url_line)
        name_match = name_pattern.search(name_line)

        # Only a URL line directly followed by a name line forms a record;
        # every other pair (name/blank, blank/URL, ...) is skipped.
        if not (url_match and name_match):
            continue

        url = url_match.group()
        name = name_match.group(1)

        try:
            # The timeout keeps one unresponsive page from hanging the run.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            # Best effort: report the failure and continue with the next
            # person instead of losing everything collected so far.
            print(f"请求失败 {url}: {exc}")
            email = not_found
        else:
            tree = html.fromstring(response.content)
            matches = tree.xpath(email_xpath)
            # Text nodes usually carry surrounding whitespace/newlines.
            email = matches[0].strip() if matches else ""
            if not email:
                email = not_found

        output_str = f"{name}:{email}\n"
        print(output_str)
        f.write(output_str)

print(f"输出已保存到文件 {filename}")
复制
| 输出TXT文本内容 |
| abc:abc@aa.com |
| ... |
复制