I Wrote a Python Script to Scrape Image and Video Links from a URL

Start by importing some libraries

Some readers complained that my earlier articles were too stiff, so from here on I'll write in this looser style. It's my first time switching things up, so if anything feels off, say so in the comments.

First, install and import the libraries we need. Not much to say here; they're all standard tools for crawler work.
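They all install from PyPI; note that BeautifulSoup's package is named beautifulsoup4 and fake_useragent's is fake-useragent:

pip install requests beautifulsoup4 pyfiglet fake-useragent colorama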

import requests
from bs4 import BeautifulSoup
import pyfiglet
import random
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from colorama import init, Fore
from urllib.parse import urljoin  # standard library; used to resolve relative media URLs
  • requests: sends the HTTP requests.
  • BeautifulSoup: parses the page HTML to find image and video links.
  • pyfiglet: renders ASCII-art text for a good-looking banner.
  • fake_useragent: generates random browser User-Agent strings to help avoid blocks.
  • colorama: adds color to the terminal output so it looks a bit cooler.
  • urljoin (standard library): turns relative media paths into full URLs.

Give the crawler a little flair

Right from the start, give the script a bit of personality: use pyfiglet to print a big welcome banner. At the very least it catches the eye.

init(autoreset=True)  # colorama: reset colors automatically after each print

art = pyfiglet.figlet_format("ZeTooL-Img")  # render the banner text as ASCII art
print(art)

What the banner looks like depends entirely on this call.
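pyfiglet also takes a font argument if you fancy a different style; "slant" below is one of the standard figlet fonts it bundles:

art = pyfiglet.figlet_format("ZeTooL-Img", font="slant")  # same text, slanted font
print(art)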

Get a URL from the user

Next, ask the user for the link they want to scrape. That URL is the crawler's target page.

url = input("Enter the URL to scrape: ")
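The rest of the script assumes a full URL with a scheme. If you want to be forgiving about input, a tiny guard like this (my addition, not in the original script) helps:

url = url.strip()
if not url.startswith(("http://", "https://")):
    url = "https://" + url  # assume HTTPS when the user omits the scheme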

Pretend to be a browser

To keep the crawler from being blocked, dress it up a little. fake_useragent produces a random browser User-Agent, and we also stuff a random IP address into the X-Forwarded-For header. To be clear, that header doesn't change your real IP; it only fools servers that naively trust it. Still, posing as different browsers and different users improves the odds of slipping through.

systems = [
    "Windows NT 6.1; Win64; x64",   # Windows 7
    "Windows NT 10.0; Win64; x64",  # Windows 10
    "Android 8.0.0; Pixel 2",       # Android 8
    "iOS 16.0; iPhone 13"           # iOS 16
]
system = random.choice(systems)  # note: never used below; ua.random supplies the whole User-Agent

# Make up a random IP in a Chinese carrier range for X-Forwarded-For
ip = f"223.104.{random.randint(0, 255)}.{random.randint(0, 255)}"
ua = UserAgent()
user_agent = ua.random  # pick a random User-Agent string

# Request headers
headers = {
    "User-Agent": user_agent,
    "X-Forwarded-For": ip,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
}

With these disguises in place, the crawler is harder to spot.

Set up a request retry mechanism

A flaky network or a struggling server can make requests fail. To keep the script from crashing, add a retry mechanism: a failed request is automatically retried up to 3 times.

# Retry strategy for failed requests
retry_strategy = Retry(
    total=3,  # retry at most 3 times
    backoff_factor=1,  # wait longer between successive retries
    status_forcelist=[500, 502, 503, 504, 404, 403],  # status codes that trigger a retry
    allowed_methods=["HEAD", "GET", "OPTIONS"]  # only retry these HTTP methods
)

session = requests.Session()
session.mount("http://", HTTPAdapter(max_retries=retry_strategy))
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

Define the page-fetching function

Write a function that sends the request and fetches the page, printing friendly error messages when things go wrong.

def fetch_page(url):
    try:
        response = session.get(url, headers=headers, timeout=10)  # 10-second request timeout
        response.raise_for_status()  # raise if the page returned an error status code
        return response
    except requests.exceptions.Timeout:
        print(Fore.RED + "Error: request timed out. Check your connection or try again later.")
        return None
    except requests.exceptions.TooManyRedirects:
        print(Fore.RED + "Error: the URL redirected too many times.")
        return None
    except requests.exceptions.HTTPError as http_err:
        print(Fore.RED + f"HTTP error: {http_err}")
        return None
    except requests.exceptions.RequestException as e:
        print(Fore.RED + f"Request failed: {e}")
        return None

This way, the rest of the script only runs once the page has actually been fetched.
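The function gets called once, and everything afterwards only runs if it returned a response (this is exactly how the full script at the end wires it up):

response = fetch_page(url)
if response is None:
    print(Fore.RED + "Could not scrape that URL. Check it or try again later.")
else:
    ...  # parsing, extraction, and saving go here (next sections)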

Parse the page and grab image and video links

Use BeautifulSoup to parse the HTML and grab the src attribute from every <img> and <video> tag. Relative paths are resolved into full URLs with urljoin, which handles edge cases far better than naive string concatenation.

soup = BeautifulSoup(response.text, 'html.parser')

image_links = []  # collected image URLs
video_links = []  # collected video URLs

# Extract image links
for img_tag in soup.find_all('img'):
    img_src = img_tag.get('src')
    if img_src:
        img_src = urljoin(url, img_src)  # resolve relative paths against the page URL
        image_links.append(img_src)
        print(Fore.GREEN + f"[Fetch Log] Image Link: {img_src}")

# Extract video links
for video_tag in soup.find_all('video'):
    video_src = video_tag.get('src')
    if video_src:
        video_src = urljoin(url, video_src)
        video_links.append(video_src)
        print(Fore.GREEN + f"[Fetch Log] Video Link: {video_src}")
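One wrinkle: many pages put the actual URL on <source> children inside <video> rather than on the tag itself. The full script at the end walks those too; this loop nests inside the for video_tag loop above:

    # <video> tags often carry their URLs on nested <source> children
    for source_tag in video_tag.find_all('source'):
        source_src = source_tag.get('src')
        if source_src:
            source_src = urljoin(url, source_src)
            video_links.append(source_src)
            print(Fore.GREEN + f"[Fetch Log] Video Link: {source_src}")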

Save the links to a file

Save the scraped image and video links to media_links.txt. The format is simple and clear, so the file is easy to process later.

with open('media_links.txt', 'w', encoding='utf-8') as file:
    file.write('--- Image Links ---\n')
    for img_link in image_links:
        file.write(img_link + '\n')

    file.write('\n--- Video Links ---\n')
    for video_link in video_links:
        file.write(video_link + '\n')

print(Fore.GREEN + 'Links saved to media_links.txt.')
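As a follow-up, here is one way you might consume that file later to actually download the media. This is my own rough sketch, not part of the original script; it just mirrors the '--- Image Links ---' / '--- Video Links ---' markers written above and reuses the same session and headers:

import os

os.makedirs('downloads', exist_ok=True)

# skip blank lines and the section markers, keep only the URLs
with open('media_links.txt', encoding='utf-8') as file:
    links = [line.strip() for line in file
             if line.strip() and not line.startswith('---')]

for i, link in enumerate(links):
    resp = session.get(link, headers=headers, timeout=10)
    if resp.ok:
        # crude filename taken from the URL; real code should sanitize this
        name = link.rsplit('/', 1)[-1] or f"media_{i}"
        with open(os.path.join('downloads', name), 'wb') as out:
            out.write(resp.content)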

The full script

import requests
from bs4 import BeautifulSoup
import pyfiglet
import random
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from colorama import init, Fore
from urllib.parse import urljoin

# May Buddha bless this code and keep it bug-free
#      ____
#     /    \
#    /      \
#   |        |
#   |  O  O  |   Buddha bless
#   |   ^    |
#   |  ---   |
#    \______/
#    /      \
#   /________\
#   |        |
#   |        |
#   |        |
#   (__)  (__)

init(autoreset=True)

art = pyfiglet.figlet_format("ZeTooL-Img")
print(art)

url = input("Enter the URL to scrape: ")

systems = [
    "Windows NT 6.1; Win64; x64",
    "Windows NT 10.0; Win64; x64",
    "Windows NT 6.3; Win64; x64",
    "Android 8.0.0; Pixel 2",
    "Android 9.0.0; Pixel 3",
    "Android 10.0; Pixel 4",
    "iOS 16.0; iPhone 13",
    "iOS 17.0; iPhone 14",
    "iOS 18.0; iPhone 15"
]
system = random.choice(systems)  # note: never used below; ua.random supplies the User-Agent

ip = f"223.104.{random.randint(0, 255)}.{random.randint(0, 255)}"
ua = UserAgent()
user_agent = ua.random

headers = {
    "User-Agent": user_agent,
    "X-Forwarded-For": ip,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
}

retry_strategy = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[500, 502, 503, 504, 404, 403],
    allowed_methods=["HEAD", "GET", "OPTIONS"]
)

session = requests.Session()
session.mount("http://", HTTPAdapter(max_retries=retry_strategy))
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

def fetch_page(url):
    try:
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response
    except requests.exceptions.Timeout:
        print(Fore.RED + "Error: request timed out. Check your connection or try again later.")
        return None
    except requests.exceptions.TooManyRedirects:
        print(Fore.RED + "Error: the URL redirected too many times.")
        return None
    except requests.exceptions.HTTPError as http_err:
        if response.status_code == 404:
            print(Fore.RED + "Error: page not found (404). Check that the URL is correct.")
        elif response.status_code == 403:
            print(Fore.RED + "Error: access denied (403).")
        else:
            print(Fore.RED + f"HTTP error: {http_err}")
        return None
    except requests.exceptions.RequestException as e:
        print(Fore.RED + f"Request failed: {e}")
        return None

response = fetch_page(url)
if response is None:
    print(Fore.RED + "Could not scrape that URL. Check it or try again later.")
else:
    soup = BeautifulSoup(response.text, 'html.parser')

    image_links = []
    video_links = []

    for img_tag in soup.find_all('img'):
        img_src = img_tag.get('src')
        if img_src:
            img_src = urljoin(url, img_src)  # resolve relative paths
            image_links.append(img_src)
            print(Fore.GREEN + f"[Fetch Log] Image Link: {img_src}")

    for video_tag in soup.find_all('video'):
        video_src = video_tag.get('src')
        if video_src:
            video_src = urljoin(url, video_src)
            video_links.append(video_src)
            print(Fore.GREEN + f"[Fetch Log] Video Link: {video_src}")

        # <video> tags often carry their URLs on nested <source> children
        for source_tag in video_tag.find_all('source'):
            source_src = source_tag.get('src')
            if source_src:
                source_src = urljoin(url, source_src)
                video_links.append(source_src)
                print(Fore.GREEN + f"[Fetch Log] Video Link: {source_src}")

    with open('media_links.txt', 'w', encoding='utf-8') as file:
        file.write('--- Image Links ---\n')
        for img_link in image_links:
            file.write(img_link + '\n')

        file.write('\n--- Video Links ---\n')
        for video_link in video_links:
            file.write(video_link + '\n')

    print(Fore.GREEN + 'Links saved to media_links.txt.')
