import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from requests.auth import HTTPBasicAuth

# Basic-auth credentials and local proxy used to reach the Charles web control interface
AUTH = HTTPBasicAuth('tower', '123456')
PROXIES = {"http": "http://127.0.0.1:8888"}
# URLs containing these keywords trigger state-changing actions and must never be requested
BLACKLIST = ['enable', 'disable', 'start', 'stop', 'clear', 'quit', 'reset', 'export']


def simple_safe_crawl(url, visited=None):
    if visited is None:
        visited = set()
    if url in visited:
        return []
    visited.add(url)

    # Record blacklisted (dangerous) URLs without actually requesting them
    if any(word in url.lower() for word in BLACKLIST):
        return [url]

    try:
        resp = requests.get(url, auth=AUTH, proxies=PROXIES, timeout=3)
        soup = BeautifulSoup(resp.text, 'html.parser')
        all_found = []
        for a in soup.find_all('a'):
            href = a.get('href')
            # Skip empty links and parent-directory links
            if not href or href.startswith('..'):
                continue
            full_url = urljoin(url, href)
            all_found.extend(simple_safe_crawl(full_url, visited))
        return all_found
    except requests.RequestException:
        return []


if __name__ == "__main__":
    results = simple_safe_crawl("http://control.charles/")
    print("\n--- Final scan results ---")
    for r in sorted(set(results)):
        print(r)