之前需要采集某某论坛的图片,原本是PHP的,不支持多线程,明显速度比较慢,但很稳定。昨天试着用Py3重写了个,加入多线程,感觉非常不稳定,可能是线程限制有点问题,外带异常处理也不大好。
Py代码还是非常简洁和优雅的,这就不说了。只是看我能否写得那么优雅了。
#!/usr/bin/env python3
import urllib.request
import urllib.error
import threading
import re
import os
import hashlib
import time
import sys
root_url = "{Your URL}" # Root URL of the target forum; must end with a slash, e.g. http://bbs.xx.com/
root_path = "{Your Path}" # Destination directory for saved pictures; must end with a slash, e.g. /home/xxx/pic/
FORUM_ID = {Your ID} # Forum board id: the 9 in http://bbs.xx.com/forum-9-1.html (the 1 is the page number)
pthread_number = 0 # Current number of live download threads (-1 is the "all done" sentinel)
LOCK = threading.Lock() # Guards pthread_number and p_map
p_map = {} # In-flight downloads: md5(url) -> url, used by the monitor loop
def waiting():
    """Busy-wait until the number of live worker threads drops below the cap (40)."""
    # NOTE(review): reads the shared counter without holding LOCK; a stale
    # read only costs one extra 0.5 s poll, so the race is tolerable here.
    while not pthread_number < 40:
        time.sleep(0.5)
def md5(string): # 获取一个字符串的MD5值,需要进行encode()编码
md5 = hashlib.md5()
md5.update(string)
return md5.hexdigest()
def get_page(url):
    """Fetch *url*, decode the body as GBK and strip all newline characters.

    BUG FIX: the original never closed the HTTP response; the context
    manager guarantees the socket is released even if read() raises.
    """
    with urllib.request.urlopen(url) as rsp:
        page = rsp.read()
    return page.decode("GBK").replace('\n', '').replace('\r', '')
class DownloadPicture(threading.Thread):  # Worker thread that downloads one picture
    def __init__(self, url, path):
        threading.Thread.__init__(self)
        self.url = url    # remote picture URL
        self.path = path  # destination file path
    def run(self):
        global pthread_number
        global p_map
        # Register this download under the lock: bump the live-thread
        # counter and record the URL so the monitor loop can report it.
        with LOCK:
            pthread_number += 1
            p_map[md5(self.url.encode())] = self.url
        try:
            with urllib.request.urlopen(self.url) as req:
                data = req.read()
            # BUG FIX: the original used `data is ''`, an identity compare
            # of bytes against a str — always False, so the empty-body
            # check never fired. Test emptiness properly instead.
            if not data:
                return
            with open(self.path, 'wb') as f:
                f.write(data)
            print(self.path)
        except urllib.error.HTTPError:
            print(self.path + ", http error")
        except Exception:
            # Broad catch is deliberate (one bad picture must not kill the
            # crawl), but narrowed from bare `except:` so KeyboardInterrupt
            # and SystemExit still propagate.
            print(self.path + ", error")
        finally:
            # Deregister under the lock on every exit path.
            with LOCK:
                pthread_number -= 1
                del p_map[md5(self.url.encode())]
class DownloadPage(threading.Thread):  # Thread that parses one forum thread page
    def __init__(self, page, name, n):
        threading.Thread.__init__(self)
        self.name = self.filter_name(name)  # sanitized directory name
        self.number_flag = n                # countdown index, progress reporting only
        self.page = root_url + page
        # Two-level layout: <root>/<first 2 hex chars of md5(name)>/<name>/
        self.path = root_path + md5(self.name.encode("GBK")).lower()[:2] + "/"
        # BUG FIX: isdir-then-mkdir raced when several page threads created
        # the same bucket concurrently; makedirs(exist_ok=True) is atomic
        # with respect to "already exists".
        os.makedirs(self.path, exist_ok=True)
        self.path = self.path + self.name + "/"
        os.makedirs(self.path, exist_ok=True)
    def filter_name(self, name):
        """Replace characters that are unsafe in a directory name with '_'."""
        for ch in ['/', '\\', ':', '*', '?', '"', '<', '>', '|', "'", ')']:
            name = name.replace(ch, '_')
        return name
    def run(self):
        page = get_page(self.page)
        # First post body of the thread.
        match = re.findall(r'<td class="t_f" id="postmessage_[0-9]+">(.*?)</td>', page)
        if len(match) < 1:
            return
        page = match[0]
        matches = re.findall(r'src="(http[\S]+)"', page)  # embedded image URLs
        if not matches:
            print(self.name, ", no picture")
            return
        workers = []  # renamed: the original shadowed the builtin `list`
        for url in matches:
            name = self.get_basename(url)
            if os.path.isfile(self.path + name):
                print(self.path + name + ", is exists")
                continue  # already downloaded, skip
            waiting()  # honor the global thread cap
            worker = DownloadPicture(url, self.path + name)
            workers.append(worker)
            worker.start()
        for worker in workers:  # wait for every picture of this page
            worker.join()
        self.check_dir()
    def get_basename(self, url):
        """Derive a file name from *url*; fall back to an MD5 name when unusable."""
        name = os.path.basename(url)
        ex = os.path.splitext(name)
        if ex[1] == '' or self.is_error_name(name):
            name = md5(url.encode()) + ".jpg"
        return name
    def is_error_name(self, name):
        """Return True when *name* contains a filesystem-unsafe character."""
        for ch in ['/', '\\', ':', '*', '?', '"', '<', '>', '|', "'", ')']:
            if name.find(ch) > -1:
                return True
        return False
    def check_dir(self):
        """Post-download hook; currently only reports completion."""
        # TODO: add empty-directory cleanup and other checks
        print(self.page, self.path, ":", self.number_flag, " check finish")
class ImageCollect(threading.Thread):
    """Top-level crawler: walks forum index pages and spawns DownloadPage threads."""
    def __init__(self, fid, begin, end=1):
        threading.Thread.__init__(self)
        self.fid = fid      # forum board id
        self.begin = begin  # first index page number
        self.end = end      # how many index pages to crawl
    def process_page(self, url):
        """Return (thread-href, title) pairs scraped from one forum index page."""
        page = get_page(url)
        return re.findall(
            r'<a href="(thread-[0-9]+-[0-9]+-[0-9]+\.html)" onclick="atarget\(this\)" class="xst" >(.*?)</a>',
            page)
    def run(self):
        global pthread_number
        # BUG FIX: isdir-then-mkdir raced with other processes; exist_ok
        # makes creation idempotent.
        os.makedirs(root_path, exist_ok=True)
        for i in range(self.begin, self.end + self.begin):
            url = "%sforum-%s-%s.html" % (root_url, self.fid, i)
            threads = []  # renamed: the original shadowed the builtin `list`
            all_page = self.process_page(url)
            n = len(all_page)
            for href, title in all_page:
                waiting()  # honor the global thread cap
                thr = DownloadPage(href, title, n)
                threads.append(thr)
                thr.start()
                n -= 1
            for thr in threads:  # wait for every page of this index
                thr.join()
        print("\nOVER finish")
        # Sentinel that tells the main monitoring loop to stop.
        pthread_number = -1
if __name__ == '__main__':
    # CLI: begin_page (first index page) and number (how many pages to crawl).
    if len(sys.argv) != 3:
        print("Usage: " + sys.argv[0] + " begin_page number")
        sys.exit(2)
    ic = ImageCollect(FORUM_ID, int(sys.argv[1]), int(sys.argv[2]))
    ic.start()
    while True:  # report queue state every five seconds
        # BUG FIX: the original broke out of the loop while still holding
        # LOCK, leaking it; `with` releases on every exit path.
        with LOCK:
            if pthread_number == -1:
                break
            print("TH:", pthread_number)
            for key in p_map:
                print("TH:", p_map[key])
        time.sleep(5)
import urllib.request
import urllib.error
import threading
import re
import os
import hashlib
import time
import sys
root_url = "{Your URL}" # Root URL of the target forum; must end with a slash, e.g. http://bbs.xx.com/
root_path = "{Your Path}" # Destination directory for saved pictures; must end with a slash, e.g. /home/xxx/pic/
FORUM_ID = {Your ID} # Forum board id: the 9 in http://bbs.xx.com/forum-9-1.html (the 1 is the page number)
pthread_number = 0 # Current number of live download threads (-1 is the "all done" sentinel)
LOCK = threading.Lock() # Guards pthread_number and p_map
p_map = {} # In-flight downloads: md5(url) -> url, used by the monitor loop
def waiting():
    """Busy-wait until the number of live worker threads drops below the cap (40)."""
    # NOTE(review): reads the shared counter without holding LOCK; a stale
    # read only costs one extra 0.5 s poll, so the race is tolerable here.
    while not pthread_number < 40:
        time.sleep(0.5)
def md5(string): # 获取一个字符串的MD5值,需要进行encode()编码
md5 = hashlib.md5()
md5.update(string)
return md5.hexdigest()
def get_page(url):
    """Fetch *url*, decode the body as GBK and strip all newline characters.

    BUG FIX: the original never closed the HTTP response; the context
    manager guarantees the socket is released even if read() raises.
    """
    with urllib.request.urlopen(url) as rsp:
        page = rsp.read()
    return page.decode("GBK").replace('\n', '').replace('\r', '')
class DownloadPicture(threading.Thread):  # Worker thread that downloads one picture
    def __init__(self, url, path):
        threading.Thread.__init__(self)
        self.url = url    # remote picture URL
        self.path = path  # destination file path
    def run(self):
        global pthread_number
        global p_map
        # Register this download under the lock: bump the live-thread
        # counter and record the URL so the monitor loop can report it.
        with LOCK:
            pthread_number += 1
            p_map[md5(self.url.encode())] = self.url
        try:
            with urllib.request.urlopen(self.url) as req:
                data = req.read()
            # BUG FIX: the original used `data is ''`, an identity compare
            # of bytes against a str — always False, so the empty-body
            # check never fired. Test emptiness properly instead.
            if not data:
                return
            with open(self.path, 'wb') as f:
                f.write(data)
            print(self.path)
        except urllib.error.HTTPError:
            print(self.path + ", http error")
        except Exception:
            # Broad catch is deliberate (one bad picture must not kill the
            # crawl), but narrowed from bare `except:` so KeyboardInterrupt
            # and SystemExit still propagate.
            print(self.path + ", error")
        finally:
            # Deregister under the lock on every exit path.
            with LOCK:
                pthread_number -= 1
                del p_map[md5(self.url.encode())]
class DownloadPage(threading.Thread):  # Thread that parses one forum thread page
    def __init__(self, page, name, n):
        threading.Thread.__init__(self)
        self.name = self.filter_name(name)  # sanitized directory name
        self.number_flag = n                # countdown index, progress reporting only
        self.page = root_url + page
        # Two-level layout: <root>/<first 2 hex chars of md5(name)>/<name>/
        self.path = root_path + md5(self.name.encode("GBK")).lower()[:2] + "/"
        # BUG FIX: isdir-then-mkdir raced when several page threads created
        # the same bucket concurrently; makedirs(exist_ok=True) is atomic
        # with respect to "already exists".
        os.makedirs(self.path, exist_ok=True)
        self.path = self.path + self.name + "/"
        os.makedirs(self.path, exist_ok=True)
    def filter_name(self, name):
        """Replace characters that are unsafe in a directory name with '_'."""
        for ch in ['/', '\\', ':', '*', '?', '"', '<', '>', '|', "'", ')']:
            name = name.replace(ch, '_')
        return name
    def run(self):
        page = get_page(self.page)
        # First post body of the thread.
        match = re.findall(r'<td class="t_f" id="postmessage_[0-9]+">(.*?)</td>', page)
        if len(match) < 1:
            return
        page = match[0]
        matches = re.findall(r'src="(http[\S]+)"', page)  # embedded image URLs
        if not matches:
            print(self.name, ", no picture")
            return
        workers = []  # renamed: the original shadowed the builtin `list`
        for url in matches:
            name = self.get_basename(url)
            if os.path.isfile(self.path + name):
                print(self.path + name + ", is exists")
                continue  # already downloaded, skip
            waiting()  # honor the global thread cap
            worker = DownloadPicture(url, self.path + name)
            workers.append(worker)
            worker.start()
        for worker in workers:  # wait for every picture of this page
            worker.join()
        self.check_dir()
    def get_basename(self, url):
        """Derive a file name from *url*; fall back to an MD5 name when unusable."""
        name = os.path.basename(url)
        ex = os.path.splitext(name)
        if ex[1] == '' or self.is_error_name(name):
            name = md5(url.encode()) + ".jpg"
        return name
    def is_error_name(self, name):
        """Return True when *name* contains a filesystem-unsafe character."""
        for ch in ['/', '\\', ':', '*', '?', '"', '<', '>', '|', "'", ')']:
            if name.find(ch) > -1:
                return True
        return False
    def check_dir(self):
        """Post-download hook; currently only reports completion."""
        # TODO: add empty-directory cleanup and other checks
        print(self.page, self.path, ":", self.number_flag, " check finish")
class ImageCollect(threading.Thread):
    """Top-level crawler: walks forum index pages and spawns DownloadPage threads."""
    def __init__(self, fid, begin, end=1):
        threading.Thread.__init__(self)
        self.fid = fid      # forum board id
        self.begin = begin  # first index page number
        self.end = end      # how many index pages to crawl
    def process_page(self, url):
        """Return (thread-href, title) pairs scraped from one forum index page."""
        page = get_page(url)
        return re.findall(
            r'<a href="(thread-[0-9]+-[0-9]+-[0-9]+\.html)" onclick="atarget\(this\)" class="xst" >(.*?)</a>',
            page)
    def run(self):
        global pthread_number
        # BUG FIX: isdir-then-mkdir raced with other processes; exist_ok
        # makes creation idempotent.
        os.makedirs(root_path, exist_ok=True)
        for i in range(self.begin, self.end + self.begin):
            url = "%sforum-%s-%s.html" % (root_url, self.fid, i)
            threads = []  # renamed: the original shadowed the builtin `list`
            all_page = self.process_page(url)
            n = len(all_page)
            for href, title in all_page:
                waiting()  # honor the global thread cap
                thr = DownloadPage(href, title, n)
                threads.append(thr)
                thr.start()
                n -= 1
            for thr in threads:  # wait for every page of this index
                thr.join()
        print("\nOVER finish")
        # Sentinel that tells the main monitoring loop to stop.
        pthread_number = -1
if __name__ == '__main__':
    # CLI: begin_page (first index page) and number (how many pages to crawl).
    if len(sys.argv) != 3:
        print("Usage: " + sys.argv[0] + " begin_page number")
        sys.exit(2)
    ic = ImageCollect(FORUM_ID, int(sys.argv[1]), int(sys.argv[2]))
    ic.start()
    while True:  # report queue state every five seconds
        # BUG FIX: the original broke out of the loop while still holding
        # LOCK, leaking it; `with` releases on every exit path.
        with LOCK:
            if pthread_number == -1:
                break
            print("TH:", pthread_number)
            for key in p_map:
                print("TH:", p_map[key])
        time.sleep(5)
回访咯
前來支持一下~高手啊!
赞一个
高手!感觉以后挖图可以用的上!
好长,但是我没看,哈哈!!
是我也懒得看
哈哈,不过我想采集妹纸图片
我本就是采集妹纸图片来着
python3赞一个