仅供学习参考
一、获取 HTML 页面中每个人的姓名和链接,写入 TXT 文件
import requests
from lxml import html
from urllib.parse import urljoin

# Address of the listing page; replace it with the real one before running.
base_url = "https://abcdef自己的网址要改"
# Fixed part of the XPath: only the <li> index changes from one person to the
# next. Copy the XPath of one entry from the browser and adapt it here.
fixed_xpath = "/html/body/div[4]/div[2]/ul/li[{div_index}]/a"
filename = "现TXT文本内容.txt"
# Highest <li> index that is probed (the page is assumed to list 100 people).
max_people = 100
# Seconds to wait for the server before giving up on the request.
request_timeout = 10


def resolve_link(page_url, href):
    """Return *href* as an absolute URL, resolved against *page_url*.

    Absolute hrefs are returned unchanged; relative ones are joined onto the
    page address so that the follow-up script, which only recognises
    ``http(s)://`` URLs, can open them. A missing or empty href yields "".
    """
    if not href:
        return ""
    return urljoin(page_url, href.strip())


def format_entry(url, name):
    """Return the three-line text record written for one person."""
    return f"网址路径:{url}\n姓名:{name}\n\n"


def main():
    """Download the listing page and save every person's link and name."""
    response = requests.get(base_url, timeout=request_timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    response.raise_for_status()
    # The raw bytes are parsed, so the encoding is fixed on the parser itself.
    tree = html.fromstring(
        response.content, parser=html.HTMLParser(encoding='utf-8')
    )

    with open(filename, "w", encoding="utf-8") as f:
        # XPath positions are 1-based; the upper bound is inclusive.
        for div_index in range(1, max_people + 1):
            xpath = fixed_xpath.format(div_index=div_index)
            for person_element in tree.xpath(xpath):
                url = resolve_link(base_url, person_element.get("href"))
                # string() concatenates all text nested inside the <a> element.
                name = person_element.xpath('string()').strip()
                output_str = format_entry(url, name)
                print(output_str)
                f.write(output_str)

    print(f"输出已保存到文件 {filename}")


if __name__ == "__main__":
    main()
结果:现TXT文本内容
网址路径:http://abc.html
姓名:abc
二、读取上一步生成的 TXT 文件,逐个打开其中的链接并提取邮箱,将"姓名:邮箱"写入新的 TXT 文件
import re
import requests
from lxml import html
# File produced by the previous step: a URL line followed by a name line.
source_filename = "现TXT文本内容.txt"
# XPath of the text node holding the email address on each person's page.
email_xpath = '/html/body/div[4]/div/div/div/div/div[2]/div[1]/div[2]/div[4]/div[1]/text()'
filename = "现TXT文本内容邮箱.txt"
# Placeholder written when no email could be obtained for a person.
email_not_found = "未找到邮箱地址"
# Seconds to wait for each page before giving up on it.
request_timeout = 10

# Compiled once because they are applied to every line of the input.
url_pattern = re.compile(r"https?://[^\s]+")
name_pattern = re.compile(r"姓名:(.+)")


def iter_people(lines):
    """Yield ``(url, name)`` for each URL line directly followed by a name line.

    Adjacent lines are paired with ``zip`` so the last line is never indexed
    past the end of the list; pairs where either pattern fails to match
    (blank separators, malformed records) are skipped.
    """
    for url_line, name_line in zip(lines, lines[1:]):
        url_match = url_pattern.search(url_line)
        name_match = name_pattern.search(name_line)
        if url_match and name_match:
            yield url_match.group(), name_match.group(1)


def fetch_email(url):
    """Return the email text found on the page at *url*.

    Returns the ``email_not_found`` placeholder when the request fails, the
    page is empty, or the XPath matches nothing, so one bad link does not
    abort the whole run.
    """
    try:
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"请求失败 {url}: {exc}")
        return email_not_found
    # lxml refuses to parse an empty document, so bail out early.
    if not response.content.strip():
        return email_not_found
    tree = html.fromstring(response.content)
    matches = tree.xpath(email_xpath)
    return matches[0].strip() if matches else email_not_found


def main():
    """Read the URL/name list, look up each email and write ``name:email``."""
    # Undecodable bytes are dropped rather than aborting the run.
    with open(source_filename, "rb") as file:
        content = file.read().decode('utf-8', 'ignore')
    lines = content.splitlines()

    with open(filename, "w", encoding="utf-8") as f:
        for url, name in iter_people(lines):
            email = fetch_email(url)
            output_str = f"{name}:{email}\n"
            print(output_str)
            f.write(output_str)

    print(f"输出已保存到文件 {filename}")


if __name__ == "__main__":
    main()
输出TXT文本内容
abc:abc@aa.com
...