【渗透-信息收集】JSFinder脚本爬取网站js数据的使用

1.介绍：

JSFinder 是一个用于探测网站中 JavaScript 文件的工具，它可以帮助安全研究人员发现网站加载的 JavaScript 文件中包含的网址、子域名、API 端点等信息。这种类型的工具通常用于信息收集阶段，是网络安全和渗透测试领域中的常用工具之一。

2.下载和使用

2.1 下载JSFinder脚本

链接： https://github.com/Threezh1/JSFinder
在这里插入图片描述

在这里插入图片描述
注意：此脚本使用 Python 编写，运行前需要先准备好 Python 3 环境，并安装脚本依赖的 requests 和 beautifulsoup4 库（pip install requests beautifulsoup4）；Python 的安装步骤本教程略过。

2.2 使用

在这里插入图片描述
打开 cmd，并切换到脚本所在的目录。

输入命令：python JSFinder.py -u https://www.csdn.net/
运行结果如下：
在这里插入图片描述

3.脚本源码

#!/usr/bin/env python
# coding: utf-8
# By Threezh1
# https://threezh1.github.io/
 
import requests, argparse, sys, re
from requests.packages import urllib3
from urllib.parse import urlparse
from bs4 import BeautifulSoup
 
def parse_args(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, exactly as before.

    Returns:
        argparse.Namespace with the attributes ``url``, ``cookie``, ``file``,
        ``outputurl``, ``outputsubdomain``, ``js`` and ``deep``.

    Raises:
        SystemExit: On invalid arguments, or when neither ``-u`` nor ``-f`` is
            given (argparse prints the usage text and exits with status 2).
    """
    parser = argparse.ArgumentParser(
        # The default formatter re-wraps the epilog and drops its line break.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="Example:\n  python " + sys.argv[0] + " -u http://www.baidu.com",
    )
    parser.add_argument("-u", "--url", help="The website")
    parser.add_argument("-c", "--cookie", help="The website cookie")
    parser.add_argument("-f", "--file", help="The file contains url or js")
    parser.add_argument("-ou", "--outputurl", help="Output file name. ")
    parser.add_argument("-os", "--outputsubdomain", help="Output file name. ")
    parser.add_argument("-j", "--js", help="Find in js file", action="store_true")
    parser.add_argument("-d", "--deep", help="Deep find", action="store_true")
    args = parser.parse_args(argv)
    # Without a target the script used to crash later with a TypeError;
    # fail early with a usage message instead.
    if args.url is None and args.file is None:
        parser.error("one of -u/--url or -f/--file is required")
    return args
 
# Regular expression comes from https://github.com/GerbenJavado/LinkFinder
_URL_PATTERN = re.compile(r"""
  (?:"|')                               # Start newline delimiter
  (
    ((?:[a-zA-Z]{1,10}://|//)           # Match a scheme [a-Z]*1-10 or //
    [^"'/]{1,}\.                        # Match a domainname (any character + dot)
    [a-zA-Z]{2,}[^"']{0,})              # The domainextension and/or path
    |
    ((?:/|\.\./|\./)                    # Start with /,../,./
    [^"'><,;| *()(%%$^/\\\[\]]          # Next character can't be...
    [^"'><,;|()]{1,})                   # Rest of the characters can't be
    |
    ([a-zA-Z0-9_\-/]{1,}/               # Relative endpoint with /
    [a-zA-Z0-9_\-/]{1,}                 # Resource name
    \.(?:[a-zA-Z]{1,4}|action)          # Rest + extension (length 1-4 or action)
    (?:[\?|/][^"|']{0,}|))              # ? mark with parameters
    |
    ([a-zA-Z0-9_\-]{1,}                 # filename
    \.(?:php|asp|aspx|jsp|json|
         action|html|js|txt|xml)             # . + extension
    (?:\?[^"|']{0,}|))                  # ? mark with parameters
  )
  (?:"|')                               # End newline delimiter
""", re.VERBOSE)


def extract_URL(JS):
    """Extract quoted URL-like strings from JavaScript/HTML source text.

    Args:
        JS: Source text to scan. Non-string values are converted with
            ``str()``; ``None`` (a failed download) yields no results.

    Returns:
        List of the matched strings without their surrounding quotes, in
        order of first appearance and without duplicates. Empty when nothing
        matches.
    """
    if JS is None:
        return []
    # Group 1 is the text between the quote delimiters. The previous version
    # filtered against a list that was never filled, so duplicates were never
    # actually removed; dict.fromkeys de-duplicates while keeping order.
    matches = (match.group(1) for match in _URL_PATTERN.finditer(str(JS)))
    return list(dict.fromkeys(matches))
 
# Get the page source
def Extract_html(URL):
    """Download *URL* and return the response body as text.

    The request carries a fixed desktop Chrome User-Agent and the cookie taken
    from the module-level ``args`` (set in the ``__main__`` section), uses
    ``timeout=3`` and disables TLS certificate verification.

    Args:
        URL: Address to request.

    Returns:
        The body decoded as UTF-8 with undecodable bytes dropped, or ``None``
        when the request or the decoding fails.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
        "Cookie": args.cookie,
    }
    try:
        response = requests.get(URL, headers=header, timeout=3, verify=False)
        return response.content.decode("utf-8", "ignore")
    except Exception:
        # Best effort: any failure just means "page unavailable". Unlike the
        # former bare ``except:``, Ctrl-C (KeyboardInterrupt) and SystemExit
        # are no longer swallowed.
        return None
 
# Handling relative URLs
def process_url(URL, re_URL):
    """Turn a link found on page *URL* into an absolute URL.

    Relative links are resolved against the site root (scheme + host) of
    *URL*, not against its path.

    Args:
        URL: Absolute address of the document the link was found in.
        re_URL: The link as written in the document.

    Returns:
        * ``//host/path``          -> *URL*'s scheme + ``:`` + link
        * links starting ``http``  -> returned unchanged
        * ``javascript:`` links    -> *URL* itself (the link is filtered out)
        * everything else          -> ``scheme://host/`` + link, with leading
          ``./`` and ``../`` segments removed
    """
    black_url = ("javascript:",)  # Add some keyword for filter url.
    URL_raw = urlparse(URL)
    origin = URL_raw.scheme + "://" + URL_raw.netloc

    if re_URL.startswith("//"):
        return URL_raw.scheme + ":" + re_URL
    if re_URL.startswith("http"):
        return re_URL
    # Prefix match: the old exact comparison let "javascript:void(0)" through
    # and turned it into a bogus URL.
    if re_URL.lower().startswith(black_url):
        return URL
    if re_URL.startswith("/"):
        return origin + re_URL

    path = re_URL
    # Drop every leading "./" or "../" segment ("../../a.js" -> "a.js").
    while path.startswith(("./", "../")):
        path = path.partition("/")[2]
    if path in (".", ".."):
        path = ""
    # Names that merely begin with a dot (".well-known/x") keep the dot and
    # get a separating slash instead of being glued onto the host name.
    return origin + "/" + path
 
def find_last(string,str):
    """Return the start index of every occurrence of *str* in *string*.

    Despite the name, all positions are returned (in ascending order), not
    only the last one. Each search resumes one character after the previous
    hit, so overlapping occurrences are reported too.
    """
    found = []
    index = string.find(str)
    while index != -1:
        found.append(index)
        index = string.find(str, index + 1)
    return found
 
def find_by_url(url, js=False):
    """Collect the URLs referenced by a page's scripts, or by one JS file.

    Args:
        url: Address of the page (or of the JS file when *js* is true).
        js: When false, *url* is fetched as HTML; its inline ``<script>``
            bodies and every external script it references are scanned, and
            only results on the page's main domain (or without a host) are
            kept. When true, *url* is fetched and scanned directly, without
            any filtering.

    Returns:
        HTML mode: list of unique absolute URLs in discovery order, or
        ``None`` when *url* is not a string or the page cannot be fetched.
        JS mode: sorted list of the unique raw matches, or ``None`` when
        nothing was found.
    """
    if js:
        return sorted(set(extract_URL(Extract_html(url)))) or None

    if not isinstance(url, str):
        # Previously the hint was printed and the function then crashed a few
        # lines later while concatenating None.
        print("Please specify a URL like https://www.baidu.com")
        return None
    print("url:" + url)

    html_raw = Extract_html(url)
    if html_raw is None:
        print("Fail to access " + url)
        return None

    html = BeautifulSoup(html_raw, "html.parser")
    script_array = {}  # script URL -> source text (None if the download failed)
    inline_code = []
    for html_script in html.find_all("script"):
        script_src = html_script.get("src")
        if script_src is None:
            inline_code.append(html_script.get_text() + "\n")
        else:
            purl = process_url(url, script_src)
            script_array[purl] = Extract_html(purl)
    # All inline scripts are scanned together under the page's own URL.
    script_array[url] = "".join(inline_code)

    allurls = []
    for script_url, script_body in script_array.items():
        for found in extract_URL(script_body) or []:
            # Links are resolved against the script they were found in.
            allurls.append(process_url(script_url, found))

    # Main domain = last two labels of the page's host ("www.a.com" -> "a.com").
    # It does not depend on the candidate, so compute it once.
    domain = urlparse(url).netloc
    main_domain = ".".join(domain.split(".")[-2:])
    dotted_main = "." + main_domain

    result = []
    seen = set()
    for candidate in allurls:
        try:
            host = urlparse(candidate).netloc.strip()
        except ValueError:
            # urlparse rejects some malformed strings (e.g. unbalanced "[").
            continue
        # Match on a label boundary; a plain substring test also accepted
        # foreign hosts such as "evilcsdn.net" for "csdn.net".
        on_site = (
            not host
            or not main_domain
            or host == main_domain
            or host.endswith(dotted_main)
        )
        key = candidate.strip()
        if on_site and key not in seen:
            seen.add(key)
            result.append(candidate)
    return result
 
 
def find_subdomain(urls, mainurl):
    """List the distinct hosts in *urls* that belong to *mainurl*'s main domain.

    The main domain is the last two dot-separated labels of *mainurl*'s host
    (``www.csdn.net`` -> ``csdn.net``).

    Args:
        urls: Iterable of URL strings.
        mainurl: URL whose host defines the main domain. When it has no host
            (e.g. a relative path), every host found in *urls* is accepted.

    Returns:
        Hosts (``netloc`` values) in order of first appearance, without
        duplicates. URLs without a host, or that cannot be parsed, are skipped.
    """
    domain = urlparse(mainurl).netloc
    main_domain = ".".join(domain.split(".")[-2:])
    dotted_main = "." + main_domain

    subdomains = []
    seen = set()
    for url in urls:
        try:
            host = urlparse(url).netloc
        except ValueError:
            # urlparse rejects some malformed strings (e.g. unbalanced "[").
            continue
        if host.strip() == "":
            continue
        # Match on a label boundary; a plain substring test also accepted
        # foreign hosts such as "evilcsdn.net" for "csdn.net".
        if main_domain and host != main_domain and not host.endswith(dotted_main):
            continue
        if host not in seen:
            seen.add(host)
            subdomains.append(host)
    return subdomains
 
def find_by_url_deep(url):
    """Scan *url* one level deep: run ``find_by_url`` on every link of the page.

    Args:
        url: Address of the start page.

    Returns:
        List of the unique URLs found on all linked pages, in discovery order,
        or ``None`` when the start page cannot be fetched or has no links.
    """
    html_raw = Extract_html(url)
    if html_raw is None:
        print("Fail to access " + url)
        return None

    html = BeautifulSoup(html_raw, "html.parser")
    links = []
    for html_a in html.find_all("a"):
        src = html_a.get("href")
        if not src:
            continue
        link = process_url(url, src)
        if link not in links:
            links.append(link)
    if not links:
        return None
    print("ALL Find " + str(len(links)) + " links")

    urls = []
    seen = set()
    # Count down for every link, including the ones that fail; previously a
    # failed link skipped the decrement and the progress number drifted.
    for remaining, link in zip(range(len(links), 0, -1), links):
        temp_urls = find_by_url(link)
        if temp_urls is None:
            continue
        print("Remaining " + str(remaining) + " | Find " + str(len(temp_urls)) + " URL in " + link)
        for temp_url in temp_urls:
            if temp_url not in seen:
                seen.add(temp_url)
                urls.append(temp_url)
    return urls
 
	
def find_by_file(file_path, js=False):
    """Run ``find_by_url`` on every address listed in a text file.

    Args:
        file_path: Path of a file with one URL per line. Surrounding
            whitespace is stripped and blank lines are ignored.
        js: Forwarded to ``find_by_url``; true means each line is the address
            of a JS file rather than of an HTML page.

    Returns:
        List of the unique URLs found for all lines, in discovery order, or
        ``None`` when the file contains no addresses.
    """
    with open(file_path, "r") as fobject:
        # The old split("\n") turned blank/trailing lines into "" entries that
        # were then "scanned", and made the empty-file check below unreachable.
        links = [line.strip() for line in fobject if line.strip()]
    if not links:
        return None
    print("ALL Find " + str(len(links)) + " links")

    urls = []
    seen = set()
    # Count down for every entry, including the ones that fail.
    for remaining, link in zip(range(len(links), 0, -1), links):
        temp_urls = find_by_url(link, js=bool(js))
        if temp_urls is None:
            continue
        print(str(remaining) + " Find " + str(len(temp_urls)) + " URL in " + link)
        for temp_url in temp_urls:
            if temp_url not in seen:
                seen.add(temp_url)
                urls.append(temp_url)
    return urls
 
def giveresult(urls, domian):
    """Print the found URLs and their subdomains; optionally save both.

    Args:
        urls: List of URL strings, or ``None`` (nothing is printed then).
        domian: URL whose main domain is passed to ``find_subdomain``.

    The URL list is appended to ``args.outputurl`` and the subdomain list to
    ``args.outputsubdomain`` when those command-line options were given.
    """
    if urls is None:
        return None

    print("Find " + str(len(urls)) + " URL:")
    url_lines = []
    for found in urls:
        url_lines.append(found + "\n")
        print(found)
    content_url = "".join(url_lines)

    subdomains = find_subdomain(urls, domian)
    print("\nFind " + str(len(subdomains)) + " Subdomain:")
    host_lines = []
    for host in subdomains:
        host_lines.append(host + "\n")
        print(host)
    content_subdomain = "".join(host_lines)

    if args.outputurl is not None:
        with open(args.outputurl, "a", encoding='utf-8') as out_file:
            out_file.write(content_url)
        print("\nOutput " + str(len(urls)) + " urls")
        print("Path:" + args.outputurl)

    if args.outputsubdomain is not None:
        with open(args.outputsubdomain, "a", encoding='utf-8') as out_file:
            out_file.write(content_subdomain)
        print("\nOutput " + str(len(subdomains)) + " subdomains")
        print("Path:" + args.outputsubdomain)
 
if __name__ == "__main__":
    # Script entry point: choose the scan mode from the command-line options.
    urllib3.disable_warnings()  # requests are sent with verify=False
    args = parse_args()
    if args.file is None:
        if args.url is None:
            # Neither -u nor -f: stop with a hint instead of a TypeError.
            sys.exit("Please specify a URL like https://www.baidu.com (-u) or a file (-f)")
        if args.deep:
            urls = find_by_url_deep(args.url)
        else:
            urls = find_by_url(args.url)
        giveresult(urls, args.url)
    else:
        urls = find_by_file(args.file, js=args.js)
        if urls:
            # The first result serves as the reference URL for the subdomain list.
            giveresult(urls, urls[0])
        else:
            # find_by_file may return None or []; indexing urls[0] used to crash.
            print("Find 0 URL:")
 复制

【渗透-信息收集】JSFinder脚本爬取网站js数据的使用

1.介绍：

2.下载和使用

2.1 下载JSFinder脚本

2.2 使用

3.脚本源码

前端提高篇（102）：jQuery高级方法callbacks、deferred

html 无序标签有序标签及表单

CSS3新增属性(15个案例代码效果图素材)

基于CSS3媒体查询的响应式旅游网站设计与实现-计算机毕设附源码 12755

python requests编写 api接收json

vue3中getCurrentInstance不推荐使用以及在＜script setup＞中获取全局内容(三种方式)

vue选项式和组合式区别

每天10个vue面试题(七)

Vue 2 中的 `$set` 方法详解

Jquery的常用方法

前端哥

C#解析JSON的常用库--Newtonsoft.Json

jsonfield 项目常见问题解决方案

【SpringMVC】_SpringMVC项目返回HTML与JSON

BugJson因为json格式问题OOM怎么办

python 解读JSON文件，一文搞懂！

Redisson同时使用jackson、fastjson、kryo、protostuff序列化（含效率对比）

开源项目“Pretty JSON”安装与配置完全指南

2024年前端最新Nodejs基础之包管理工具npm(二)(2)，微软面试题及答案

解决全局安装pnpm后无法使用的问题

安装Nodejs后，npm无法使用

1
【Echarts系列】—— 实现电池图、3D立体圆形柱状图

2024-03-03 11:03:011001

2
CSS常用属性（文本属性）

2024-11-04 09:11:111000

3
TypeScript 中的 Number 类型，Number 类型的特性、常见操作和注意事项

2024-09-30 23:09:061000

4
CSS写代码使页面划分为左右两个区域

2024-09-09 00:09:071000

5
vue使用datav echarts

2024-09-06 00:09:381000

6
使用TweenMax.js和CSS3创建冰球运动员动画效果教程

2024-09-04 23:09:411000

7
使用CDN提高jQuery加载速度

2024-08-24 23:08:211000

8
小兔鲜儿网页首页制作黑马程序员前端基础项目自学笔记

2024-08-19 22:08:161000

9
《Vue》你的弹窗能拖动吗？Vue自定义指令实现可拖动弹窗

2024-08-19 22:08:121000

10
npm的使用

2024-08-18 00:08:131000

	#!/usr/bin/env python
	# coding: utf-8
	# By Threezh1
	# https://threezh1.github.io/

	import requests, argparse, sys, re
	from requests.packages import urllib3
	from urllib.parse import urlparse
	from bs4 import BeautifulSoup

	def parse_args(argv=None):
	    """Build the command-line parser and parse the arguments.

	    Args:
	        argv: Optional list of argument strings. ``None`` (the default) makes
	            argparse read ``sys.argv[1:]``, exactly as before.

	    Returns:
	        argparse.Namespace with the attributes ``url``, ``cookie``, ``file``,
	        ``outputurl``, ``outputsubdomain``, ``js`` and ``deep``.

	    Raises:
	        SystemExit: On invalid arguments, or when neither ``-u`` nor ``-f`` is
	            given (argparse prints the usage text and exits with status 2).
	    """
	    parser = argparse.ArgumentParser(
	        # The default formatter re-wraps the epilog and drops its line break.
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	        epilog="Example:\n  python " + sys.argv[0] + " -u http://www.baidu.com",
	    )
	    parser.add_argument("-u", "--url", help="The website")
	    parser.add_argument("-c", "--cookie", help="The website cookie")
	    parser.add_argument("-f", "--file", help="The file contains url or js")
	    parser.add_argument("-ou", "--outputurl", help="Output file name. ")
	    parser.add_argument("-os", "--outputsubdomain", help="Output file name. ")
	    parser.add_argument("-j", "--js", help="Find in js file", action="store_true")
	    parser.add_argument("-d", "--deep", help="Deep find", action="store_true")
	    args = parser.parse_args(argv)
	    # Without a target the script would crash later with a TypeError;
	    # fail early with a usage message instead.
	    if args.url is None and args.file is None:
	        parser.error("one of -u/--url or -f/--file is required")
	    return args

	# Regular expression comes from https://github.com/GerbenJavado/LinkFinder
	_URL_PATTERN = re.compile(r"""
	  (?:"|')                               # Start newline delimiter
	  (
	    ((?:[a-zA-Z]{1,10}://|//)           # Match a scheme [a-Z]*1-10 or //
	    [^"'/]{1,}\.                        # Match a domainname (any character + dot)
	    [a-zA-Z]{2,}[^"']{0,})              # The domainextension and/or path
	    |
	    ((?:/|\.\./|\./)                    # Start with /,../,./
	    [^"'><,;| *()(%%$^/\\\[\]]          # Next character can't be...
	    [^"'><,;|()]{1,})                   # Rest of the characters can't be
	    |
	    ([a-zA-Z0-9_\-/]{1,}/               # Relative endpoint with /
	    [a-zA-Z0-9_\-/]{1,}                 # Resource name
	    \.(?:[a-zA-Z]{1,4}|action)          # Rest + extension (length 1-4 or action)
	    (?:[\?|/][^"|']{0,}|))              # ? mark with parameters
	    |
	    ([a-zA-Z0-9_\-]{1,}                 # filename
	    \.(?:php|asp|aspx|jsp|json|
	         action|html|js|txt|xml)             # . + extension
	    (?:\?[^"|']{0,}|))                  # ? mark with parameters
	  )
	  (?:"|')                               # End newline delimiter
	""", re.VERBOSE)


	def extract_URL(JS):
	    """Extract quoted URL-like strings from JavaScript/HTML source text.

	    Args:
	        JS: Source text to scan. Non-string values are converted with
	            ``str()``; ``None`` (a failed download) yields no results.

	    Returns:
	        List of the matched strings without their surrounding quotes, in
	        order of first appearance and without duplicates. Empty when nothing
	        matches.
	    """
	    if JS is None:
	        return []
	    # The alternation bars must be plain "|": the escaped "\|" form of this
	    # listing matches a literal pipe and breaks the pattern. Group 1 is the
	    # text between the quotes; dict.fromkeys de-duplicates and keeps order.
	    matches = (match.group(1) for match in _URL_PATTERN.finditer(str(JS)))
	    return list(dict.fromkeys(matches))

	# Get the page source
	def Extract_html(URL):
	    """Download *URL* and return the response body as text.

	    The request carries a fixed desktop Chrome User-Agent and the cookie taken
	    from the module-level ``args`` (set in the ``__main__`` section), uses
	    ``timeout=3`` and disables TLS certificate verification.

	    Args:
	        URL: Address to request.

	    Returns:
	        The body decoded as UTF-8 with undecodable bytes dropped, or ``None``
	        when the request or the decoding fails.
	    """
	    header = {
	        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
	        "Cookie": args.cookie,
	    }
	    try:
	        response = requests.get(URL, headers=header, timeout=3, verify=False)
	        return response.content.decode("utf-8", "ignore")
	    except Exception:
	        # Best effort: any failure just means "page unavailable". Unlike a
	        # bare ``except:``, Ctrl-C (KeyboardInterrupt) and SystemExit are not
	        # swallowed.
	        return None

	# Handling relative URLs
	def process_url(URL, re_URL):
	    """Turn a link found on page *URL* into an absolute URL.

	    Relative links are resolved against the site root (scheme + host) of
	    *URL*, not against its path.

	    Args:
	        URL: Absolute address of the document the link was found in.
	        re_URL: The link as written in the document.

	    Returns:
	        * ``//host/path``          -> *URL*'s scheme + ``:`` + link
	        * links starting ``http``  -> returned unchanged
	        * ``javascript:`` links    -> *URL* itself (the link is filtered out)
	        * everything else          -> ``scheme://host/`` + link, with leading
	          ``./`` and ``../`` segments removed
	    """
	    black_url = ("javascript:",)  # Add some keyword for filter url.
	    URL_raw = urlparse(URL)
	    origin = URL_raw.scheme + "://" + URL_raw.netloc

	    if re_URL.startswith("//"):
	        return URL_raw.scheme + ":" + re_URL
	    if re_URL.startswith("http"):
	        return re_URL
	    # Prefix match: an exact comparison lets "javascript:void(0)" through
	    # and turns it into a bogus URL.
	    if re_URL.lower().startswith(black_url):
	        return URL
	    if re_URL.startswith("/"):
	        return origin + re_URL

	    path = re_URL
	    # Drop every leading "./" or "../" segment ("../../a.js" -> "a.js").
	    while path.startswith(("./", "../")):
	        path = path.partition("/")[2]
	    if path in (".", ".."):
	        path = ""
	    # Names that merely begin with a dot (".well-known/x") keep the dot and
	    # get a separating slash instead of being glued onto the host name.
	    return origin + "/" + path

	def find_last(string,str):
	    """Return the start index of every occurrence of *str* in *string*.

	    Despite the name, all positions are returned (in ascending order), not
	    only the last one. Each search resumes one character after the previous
	    hit, so overlapping occurrences are reported too.
	    """
	    # Indentation restored: the flattened listing was not valid Python.
	    found = []
	    index = string.find(str)
	    while index != -1:
	        found.append(index)
	        index = string.find(str, index + 1)
	    return found

	def find_by_url(url, js=False):
	    """Collect the URLs referenced by a page's scripts, or by one JS file.

	    Args:
	        url: Address of the page (or of the JS file when *js* is true).
	        js: When false, *url* is fetched as HTML; its inline ``<script>``
	            bodies and every external script it references are scanned, and
	            only results on the page's main domain (or without a host) are
	            kept. When true, *url* is fetched and scanned directly, without
	            any filtering.

	    Returns:
	        HTML mode: list of unique absolute URLs in discovery order, or
	        ``None`` when *url* is not a string or the page cannot be fetched.
	        JS mode: sorted list of the unique raw matches, or ``None`` when
	        nothing was found.
	    """
	    if js:
	        return sorted(set(extract_URL(Extract_html(url)))) or None

	    if not isinstance(url, str):
	        # Stop here; continuing would crash while concatenating None below.
	        print("Please specify a URL like https://www.baidu.com")
	        return None
	    print("url:" + url)

	    html_raw = Extract_html(url)
	    if html_raw is None:
	        print("Fail to access " + url)
	        return None

	    html = BeautifulSoup(html_raw, "html.parser")
	    script_array = {}  # script URL -> source text (None if the download failed)
	    inline_code = []
	    for html_script in html.find_all("script"):
	        script_src = html_script.get("src")
	        if script_src is None:
	            inline_code.append(html_script.get_text() + "\n")
	        else:
	            purl = process_url(url, script_src)
	            script_array[purl] = Extract_html(purl)
	    # All inline scripts are scanned together under the page's own URL.
	    script_array[url] = "".join(inline_code)

	    allurls = []
	    for script_url, script_body in script_array.items():
	        for found in extract_URL(script_body) or []:
	            # Links are resolved against the script they were found in.
	            allurls.append(process_url(script_url, found))

	    # Main domain = last two labels of the page's host ("www.a.com" -> "a.com").
	    domain = urlparse(url).netloc
	    main_domain = ".".join(domain.split(".")[-2:])
	    dotted_main = "." + main_domain

	    result = []
	    seen = set()
	    for candidate in allurls:
	        try:
	            host = urlparse(candidate).netloc.strip()
	        except ValueError:
	            # urlparse rejects some malformed strings (e.g. unbalanced "[").
	            continue
	        # Match on a label boundary; a plain substring test would also accept
	        # foreign hosts such as "evilcsdn.net" for "csdn.net".
	        on_site = (
	            not host
	            or not main_domain
	            or host == main_domain
	            or host.endswith(dotted_main)
	        )
	        key = candidate.strip()
	        if on_site and key not in seen:
	            seen.add(key)
	            result.append(candidate)
	    return result


	def find_subdomain(urls, mainurl):
	    """List the distinct hosts in *urls* that belong to *mainurl*'s main domain.

	    The main domain is the last two dot-separated labels of *mainurl*'s host
	    (``www.csdn.net`` -> ``csdn.net``).

	    Args:
	        urls: Iterable of URL strings.
	        mainurl: URL whose host defines the main domain. When it has no host
	            (e.g. a relative path), every host found in *urls* is accepted.

	    Returns:
	        Hosts (``netloc`` values) in order of first appearance, without
	        duplicates. URLs without a host, or that cannot be parsed, are skipped.
	    """
	    domain = urlparse(mainurl).netloc
	    main_domain = ".".join(domain.split(".")[-2:])
	    dotted_main = "." + main_domain

	    subdomains = []
	    seen = set()
	    for url in urls:
	        try:
	            host = urlparse(url).netloc
	        except ValueError:
	            # urlparse rejects some malformed strings (e.g. unbalanced "[").
	            continue
	        if host.strip() == "":
	            continue
	        # Match on a label boundary; a plain substring test would also accept
	        # foreign hosts such as "evilcsdn.net" for "csdn.net".
	        if main_domain and host != main_domain and not host.endswith(dotted_main):
	            continue
	        if host not in seen:
	            seen.add(host)
	            subdomains.append(host)
	    return subdomains

	def find_by_url_deep(url):
	    """Scan *url* one level deep: run ``find_by_url`` on every link of the page.

	    Args:
	        url: Address of the start page.

	    Returns:
	        List of the unique URLs found on all linked pages, in discovery order,
	        or ``None`` when the start page cannot be fetched or has no links.
	    """
	    html_raw = Extract_html(url)
	    if html_raw is None:
	        print("Fail to access " + url)
	        return None

	    html = BeautifulSoup(html_raw, "html.parser")
	    links = []
	    for html_a in html.find_all("a"):
	        src = html_a.get("href")
	        if not src:
	            continue
	        link = process_url(url, src)
	        if link not in links:
	            links.append(link)
	    if not links:
	        return None
	    print("ALL Find " + str(len(links)) + " links")

	    urls = []
	    seen = set()
	    # Count down for every link, including the ones that fail, so the
	    # progress number stays accurate.
	    for remaining, link in zip(range(len(links), 0, -1), links):
	        temp_urls = find_by_url(link)
	        if temp_urls is None:
	            continue
	        print("Remaining " + str(remaining) + " | Find " + str(len(temp_urls)) + " URL in " + link)
	        for temp_url in temp_urls:
	            if temp_url not in seen:
	                seen.add(temp_url)
	                urls.append(temp_url)
	    return urls


	def find_by_file(file_path, js=False):
	    """Run ``find_by_url`` on every address listed in a text file.

	    Args:
	        file_path: Path of a file with one URL per line. Surrounding
	            whitespace is stripped and blank lines are ignored.
	        js: Forwarded to ``find_by_url``; true means each line is the address
	            of a JS file rather than of an HTML page.

	    Returns:
	        List of the unique URLs found for all lines, in discovery order, or
	        ``None`` when the file contains no addresses.
	    """
	    with open(file_path, "r") as fobject:
	        # split("\n") would turn blank/trailing lines into "" entries that get
	        # "scanned" and would make the empty-file check below unreachable.
	        links = [line.strip() for line in fobject if line.strip()]
	    if not links:
	        return None
	    print("ALL Find " + str(len(links)) + " links")

	    urls = []
	    seen = set()
	    # Count down for every entry, including the ones that fail.
	    for remaining, link in zip(range(len(links), 0, -1), links):
	        temp_urls = find_by_url(link, js=bool(js))
	        if temp_urls is None:
	            continue
	        print(str(remaining) + " Find " + str(len(temp_urls)) + " URL in " + link)
	        for temp_url in temp_urls:
	            if temp_url not in seen:
	                seen.add(temp_url)
	                urls.append(temp_url)
	    return urls

	def giveresult(urls, domian):
	    """Print the found URLs and their subdomains; optionally save both.

	    Args:
	        urls: List of URL strings, or ``None`` (nothing is printed then).
	        domian: URL whose main domain is passed to ``find_subdomain``.

	    The URL list is appended to ``args.outputurl`` and the subdomain list to
	    ``args.outputsubdomain`` when those command-line options were given.
	    """
	    # Indentation restored: the flattened listing was not valid Python.
	    if urls is None:
	        return None

	    print("Find " + str(len(urls)) + " URL:")
	    url_lines = []
	    for found in urls:
	        url_lines.append(found + "\n")
	        print(found)
	    content_url = "".join(url_lines)

	    subdomains = find_subdomain(urls, domian)
	    print("\nFind " + str(len(subdomains)) + " Subdomain:")
	    host_lines = []
	    for host in subdomains:
	        host_lines.append(host + "\n")
	        print(host)
	    content_subdomain = "".join(host_lines)

	    if args.outputurl is not None:
	        with open(args.outputurl, "a", encoding='utf-8') as out_file:
	            out_file.write(content_url)
	        print("\nOutput " + str(len(urls)) + " urls")
	        print("Path:" + args.outputurl)

	    if args.outputsubdomain is not None:
	        with open(args.outputsubdomain, "a", encoding='utf-8') as out_file:
	            out_file.write(content_subdomain)
	        print("\nOutput " + str(len(subdomains)) + " subdomains")
	        print("Path:" + args.outputsubdomain)

	if __name__ == "__main__":
	    # Script entry point: choose the scan mode from the command-line options.
	    urllib3.disable_warnings()  # requests are sent with verify=False
	    args = parse_args()
	    if args.file is None:
	        if args.url is None:
	            # Neither -u nor -f: stop with a hint instead of a TypeError.
	            sys.exit("Please specify a URL like https://www.baidu.com (-u) or a file (-f)")
	        if args.deep:
	            urls = find_by_url_deep(args.url)
	        else:
	            urls = find_by_url(args.url)
	        giveresult(urls, args.url)
	    else:
	        urls = find_by_file(args.file, js=args.js)
	        if urls:
	            # The first result serves as the reference URL for the subdomain list.
	            giveresult(urls, urls[0])
	        else:
	            # find_by_file may return None or []; indexing urls[0] would crash.
	            print("Find 0 URL:")

【渗透-信息收集】JSFinder脚本爬取网站js数据的使用

1.介绍：

2.下载和使用

2.1 下载JSFinder脚本

2.2 使用

3.脚本源码

微信扫一扫：分享