YoloV3 结果对每个 class 置信度为零
YoloV3 Result Giving Zero confidence in every class
我正在为多 class 对象检测实施 Yolo v3
yolo 是基于算法的 region proposal
并将具有最大置信度的区域提案视为 yolo 的预测
有关更多信息,您可以阅读有关 here
的内容
对于这个特定的任务,我参考了这个murtuza tutorial,它指导我从头开始
现在复杂的网络架构需要数小时的训练
我更喜欢使用迁移学习,因为使用预训练网络和权重(参数)
这两个 link 你都可以在这里找到
架构配置:cfg
网络参数(权重):weights
我在这里使用 yolov3 tiny 因为我需要更高的帧率来处理视频
但毕竟它没有给出有希望的
结果如教程所示
我没有我缺乏的地方
但即使将网络 cfg 和权重文件更改为原始 yolov3(320) 也不会给出真实结果
因为我将所有 5 个空间数据作为坐标和置信度
[cx,cy,h,w,confidence] 但所有 80 classes 概率仍然是零向量 [0.0,0.0,0.0---0.0]
甚至改变视频源并选择另一个视频导致零向量
在教程中工作正常
Implementation Code:
# YOLO Algorithm
# Network Weights and configuration Files
yolov3_tiny_cfg='/root/Downloads/ML TASK/yolov3-tiny.cfg' # configuration file
yolov3_tiny_weights='/root/Downloads/ML TASK/yolov3-tiny.weights' # weights
coco_names='/root/Downloads/ML TASK/coco.names' # coco classes
# for yolo genral 320 architecture
# put paths to directory
yolov3_cfg='/root/Downloads/ML TASK/yolov3.cfg'
yolov3_weights='/root/Downloads/ML TASK/yolov3.weights'
# Test Videos
Test_video_1='/root/Downloads/ML TASK/mn.mp4'
Test_video_2='/root/Downloads/ML TASK/bg.mp4'
# Dependencies
import cv2
import numpy as np
# Dataset Classes:
# there are around 80 classes in the coco dataset so manually writing them would not be right choice so instead of them we are getting them from a file name coco.names stored in drive
# getting list of classes
classes=[] # empty list intialization
with open(coco_names,'r')as f:
classes=f.read().splitlines()
# viewing the multiclass list around 80 classes in coco dataset
# Loading the yolov3 using configuration file and weights
network=cv2.dnn.readNetFromDarknet(yolov3_cfg,yolov3_weights)
network.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)# to use opencv CPU as backend
network.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
#NOTE: The network won't feed directly the image we have to First Preprocess it To match the input shape of network also the type i.e. Blob it genrally refers to a mathematical form of binary Images Like Bitmap
Width,Height=320,320 # sqaure image so the network grid should be n*n equal on both dimension
Confidence_Threshould=0.5 # minimum problity for claiming the prediction
NMS_Threshould=0.3
cap=cv2.VideoCapture('game.mp4')
fps = cap.get(cv2.CAP_PROP_FPS)
timestamps = [cap.get(cv2.CAP_PROP_POS_MSEC)]
# function to find objects on captured video stream
def findObjects(outputs,image):
h,w,c=image.shape
bound_box=[] # for feeding through function
classIds=[]
confidence=[]
for output in outputs: # getting o/p from 2 layers(v3 tiny) 3 if use yolov3 320
for detection in output:
scores=detection[5:] #slice first five values cause we are gonnause them in bounding
classId=np.argmax(scores)
confs=scores[classIds]
# filtering object putting them as final prediction only when its breaches the minimum threshould of confidence
if confs > Confidence_Threshould:
w,h=int(detection[2]*Width),int(detection[3]*Height) # to convert % into pixel
x,y=int((detection[0]*Width)-(w/2)),int((detection[1]*Height)-(h/2))
bound_box.append([x,y,w,h])
classIds.append(classId)
confidence.append(float(confs))
print(len(bound_box))
# to downsample the no. of boxes on frame we use nms boxes it give indices by which spatial info to keep
indices=cv2.dnn.NMSBoxes(bound_box,confs,Confidence_Threshould,NMS_Threshould)
for i in indices:
i=i[0]
box=bound_box[i]
x,y,w,h=box[0],box[1],box[2],box[3]
cv2.rectangle(image,(x,y),(x+w,y+h),(255,0,0),2)
cv2.puttext(image,f'{classes[classIds[i]]}{int(confidence[i]*100)}%',
(x,y-10),cv2.FONT_HERSHEY_PLAIN,0.6,(0,255,0),2)
cv2.puttext(image,f'FPS:{fps}',(0,150),cv2.FONT_HERSHEY_PLAIN,0.6,(0,255,0),2)
cv2.puttext(image,f'TIMESTAMPS:{timestamps}',(150,0),cv2.FONT_HERSHEY_PLAIN,0.6,(0,255,0),2)
while True:
success,image=cap.read()
# coverting image into blob for network i/O processing
try:
blob=cv2.dnn.blobFromImage(image,1/255,(Width,Height),[0,0,0],crop=False)
except:
continue
# I/P
network.setInput(blob) # Setting Input
# O/P
# As Yolo Architecture Produces Three O/p[Genral Architecture] From The Respective Layer And By Summarize The Max Of Confidence to Decide Final Predictions
# But here only 2 o/p of network as we are using the tiny version for higher frame rates
# In Order to Get The Outputs We Have To Know the Name Of the Respective Layers #i.e. Not Names Actually But Getting indexes(starting from 1 Not zero) Here By Use Of getUnconnectedOutLayers Function
layers_names=network.getLayerNames()
#print(network.getUnconnectedOutLayers()) #36th and 48th indexes
#looping over as we are traversing multiple values of OutLayers
outputNames=[layers_names[i[0]-1]for i in network.getUnconnectedOutLayers()] #-1 cause the index are starting from one not zero
#print(outputNames) # for v3 tiny its 16 and 23 are layer name
# forwading the image to network
outputs=network.forward(outputNames)
# finding objects
# print(outputs[0].shape)=>(300,85) 300=>no.of boxes 85=>[cx,cy,height,width,confidence,probablity of 80 classes]
# using the cx,cy,h,w we are gonna determine the bounding box
# print(outputs[1].shape)=>(1200,85) 1200 boxes this shape present in m*n format i.e. matrix faishion where 1200 rows of boxes map with 85 vector details explained aboved
#print(outputs[0][0])
findObjects(outputs,image)
cv2.imshow('Window',image)
if cv2.waitKey(15) & 0xFF == ord('q'):
break
cap.release()
cv2.destroyAllWindows()
你的代码有很多问题。
- 您必须使用从图像中获得的 h,w,而不是用于为 YoloV3 图像生成斑点的默认宽度和高度。
改变
w,h=int(detection[2]*Width),int(detection[3]*Height)
x,y=int((detection[0]*Width)-(w/2)),int((detection[1]*Height)-(h/2))
到
w,h = int(det[2]*w) , int(det[3]*h)
x,y = int((det[0]*w)-Width/2) , int((det[1]*h)-Height/2)
- 你在 confs 和 confidence 之间混淆了很多,这很混乱,你可以查看 murtaza 教程,但这需要一些时间。
可能还有一些我漏掉的小错误。
------------------------------------最终解决方案:--- ------------------------------
为了节省您的时间这是您项目的正确代码风格。
注意 1:我稍微更改了 coco.names 标签加载方法,您的方法在我的 Macbook Pro 上效果不佳。
注意 2:在我的代码中,您必须像在原始代码中一样将文件路径更改回您的路径。
yolov3_cfg='/root/Downloads/ML TASK/yolov3.cfg'
yolov3_weights='/root/Downloads/ML TASK/yolov3.weights'
import cv2 as cv
import numpy as np
cap = cv.VideoCapture("video.mp4")
whT = 320
confThreshold =0.5
nmsThreshold= 0.2
#### LOAD MODEL
## Coco Names
classesFile = "coco.names"
classNames = open(classesFile).read().strip().split("\n")
print(classNames)
## Model Files
modelConfiguration = "yolov3.cfg"
modelWeights = "yolov3.weights"
net = cv.dnn.readNetFromDarknet(modelConfiguration, modelWeights)
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)
def findObjects(outputs,img):
hT, wT, cT = img.shape
bbox = []
classIds = []
confs = []
for output in outputs:
for det in output:
scores = det[5:]
classId = np.argmax(scores)
confidence = scores[classId]
if confidence > confThreshold:
w,h = int(det[2]*wT) , int(det[3]*hT)
x,y = int((det[0]*wT)-w/2) , int((det[1]*hT)-h/2)
bbox.append([x,y,w,h])
classIds.append(classId)
confs.append(float(confidence))
indices = cv.dnn.NMSBoxes(bbox, confs, confThreshold, nmsThreshold)
for i in indices:
i = i[0]
box = bbox[i]
x, y, w, h = box[0], box[1], box[2], box[3]
# print(x,y,w,h)
cv.rectangle(img, (x, y), (x+w,y+h), (255, 0 , 255), 2)
cv.putText(img,f'{classNames[classIds[i]].upper()} {int(confs[i]*100)}%',
(x, y-10), cv.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 2)
while True:
success, img = cap.read()
blob = cv.dnn.blobFromImage(img, 1 / 255, (whT, whT), [0, 0, 0], 1, crop=False)
net.setInput(blob)
layersNames = net.getLayerNames()
outputNames = [(layersNames[i[0] - 1]) for i in net.getUnconnectedOutLayers()]
outputs = net.forward(outputNames)
findObjects(outputs,img)
cv.imshow('Image', img)
cv.waitKey(1)
我正在为多 class 对象检测实施 Yolo v3
yolo 是基于算法的 region proposal 并将具有最大置信度的区域提案视为 yolo 的预测 有关更多信息,您可以阅读有关 here
的内容对于这个特定的任务,我参考了这个murtuza tutorial,它指导我从头开始
现在复杂的网络架构需要数小时的训练
我更喜欢使用迁移学习,因为使用预训练网络和权重(参数)
这两个 link 你都可以在这里找到
架构配置:cfg
网络参数(权重):weights
我在这里使用 yolov3 tiny 因为我需要更高的帧率来处理视频 但毕竟它没有给出有希望的 结果如教程所示 我没有我缺乏的地方 但即使将网络 cfg 和权重文件更改为原始 yolov3(320) 也不会给出真实结果 因为我将所有 5 个空间数据作为坐标和置信度 [cx,cy,h,w,confidence] 但所有 80 classes 概率仍然是零向量 [0.0,0.0,0.0---0.0] 甚至改变视频源并选择另一个视频导致零向量 在教程中工作正常
Implementation Code:
# YOLO Algorithm
# Network Weights and configuration Files
yolov3_tiny_cfg='/root/Downloads/ML TASK/yolov3-tiny.cfg' # configuration file
yolov3_tiny_weights='/root/Downloads/ML TASK/yolov3-tiny.weights' # weights
coco_names='/root/Downloads/ML TASK/coco.names' # coco classes
# for yolo genral 320 architecture
# put paths to directory
yolov3_cfg='/root/Downloads/ML TASK/yolov3.cfg'
yolov3_weights='/root/Downloads/ML TASK/yolov3.weights'
# Test Videos
Test_video_1='/root/Downloads/ML TASK/mn.mp4'
Test_video_2='/root/Downloads/ML TASK/bg.mp4'
# Dependencies
import cv2
import numpy as np
# Dataset Classes:
# there are around 80 classes in the coco dataset so manually writing them would not be right choice so instead of them we are getting them from a file name coco.names stored in drive
# getting list of classes
classes=[] # empty list intialization
with open(coco_names,'r')as f:
classes=f.read().splitlines()
# viewing the multiclass list around 80 classes in coco dataset
# Loading the yolov3 using configuration file and weights
network=cv2.dnn.readNetFromDarknet(yolov3_cfg,yolov3_weights)
network.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)# to use opencv CPU as backend
network.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
#NOTE: The network won't feed directly the image we have to First Preprocess it To match the input shape of network also the type i.e. Blob it genrally refers to a mathematical form of binary Images Like Bitmap
Width,Height=320,320 # sqaure image so the network grid should be n*n equal on both dimension
Confidence_Threshould=0.5 # minimum problity for claiming the prediction
NMS_Threshould=0.3
cap=cv2.VideoCapture('game.mp4')
fps = cap.get(cv2.CAP_PROP_FPS)
timestamps = [cap.get(cv2.CAP_PROP_POS_MSEC)]
# function to find objects on captured video stream
def findObjects(outputs,image):
h,w,c=image.shape
bound_box=[] # for feeding through function
classIds=[]
confidence=[]
for output in outputs: # getting o/p from 2 layers(v3 tiny) 3 if use yolov3 320
for detection in output:
scores=detection[5:] #slice first five values cause we are gonnause them in bounding
classId=np.argmax(scores)
confs=scores[classIds]
# filtering object putting them as final prediction only when its breaches the minimum threshould of confidence
if confs > Confidence_Threshould:
w,h=int(detection[2]*Width),int(detection[3]*Height) # to convert % into pixel
x,y=int((detection[0]*Width)-(w/2)),int((detection[1]*Height)-(h/2))
bound_box.append([x,y,w,h])
classIds.append(classId)
confidence.append(float(confs))
print(len(bound_box))
# to downsample the no. of boxes on frame we use nms boxes it give indices by which spatial info to keep
indices=cv2.dnn.NMSBoxes(bound_box,confs,Confidence_Threshould,NMS_Threshould)
for i in indices:
i=i[0]
box=bound_box[i]
x,y,w,h=box[0],box[1],box[2],box[3]
cv2.rectangle(image,(x,y),(x+w,y+h),(255,0,0),2)
cv2.puttext(image,f'{classes[classIds[i]]}{int(confidence[i]*100)}%',
(x,y-10),cv2.FONT_HERSHEY_PLAIN,0.6,(0,255,0),2)
cv2.puttext(image,f'FPS:{fps}',(0,150),cv2.FONT_HERSHEY_PLAIN,0.6,(0,255,0),2)
cv2.puttext(image,f'TIMESTAMPS:{timestamps}',(150,0),cv2.FONT_HERSHEY_PLAIN,0.6,(0,255,0),2)
while True:
success,image=cap.read()
# coverting image into blob for network i/O processing
try:
blob=cv2.dnn.blobFromImage(image,1/255,(Width,Height),[0,0,0],crop=False)
except:
continue
# I/P
network.setInput(blob) # Setting Input
# O/P
# As Yolo Architecture Produces Three O/p[Genral Architecture] From The Respective Layer And By Summarize The Max Of Confidence to Decide Final Predictions
# But here only 2 o/p of network as we are using the tiny version for higher frame rates
# In Order to Get The Outputs We Have To Know the Name Of the Respective Layers #i.e. Not Names Actually But Getting indexes(starting from 1 Not zero) Here By Use Of getUnconnectedOutLayers Function
layers_names=network.getLayerNames()
#print(network.getUnconnectedOutLayers()) #36th and 48th indexes
#looping over as we are traversing multiple values of OutLayers
outputNames=[layers_names[i[0]-1]for i in network.getUnconnectedOutLayers()] #-1 cause the index are starting from one not zero
#print(outputNames) # for v3 tiny its 16 and 23 are layer name
# forwading the image to network
outputs=network.forward(outputNames)
# finding objects
# print(outputs[0].shape)=>(300,85) 300=>no.of boxes 85=>[cx,cy,height,width,confidence,probablity of 80 classes]
# using the cx,cy,h,w we are gonna determine the bounding box
# print(outputs[1].shape)=>(1200,85) 1200 boxes this shape present in m*n format i.e. matrix faishion where 1200 rows of boxes map with 85 vector details explained aboved
#print(outputs[0][0])
findObjects(outputs,image)
cv2.imshow('Window',image)
if cv2.waitKey(15) & 0xFF == ord('q'):
break
cap.release()
cv2.destroyAllWindows()
你的代码有很多问题。
- 您必须使用从图像中获得的 h,w,而不是用于为 YoloV3 图像生成斑点的默认宽度和高度。
改变
w,h=int(detection[2]*Width),int(detection[3]*Height)
x,y=int((detection[0]*Width)-(w/2)),int((detection[1]*Height)-(h/2))
到
w,h = int(det[2]*w) , int(det[3]*h)
x,y = int((det[0]*w)-Width/2) , int((det[1]*h)-Height/2)
- 你在 confs 和 confidence 之间混淆了很多,这很混乱,你可以查看 murtaza 教程,但这需要一些时间。
可能还有一些我漏掉的小错误。
------------------------------------最终解决方案:--- ------------------------------
为了节省您的时间这是您项目的正确代码风格。
注意 1:我稍微更改了 coco.names 标签加载方法,您的方法在我的 Macbook Pro 上效果不佳。
注意 2:在我的代码中,您必须像在原始代码中一样将文件路径更改回您的路径。
yolov3_cfg='/root/Downloads/ML TASK/yolov3.cfg'
yolov3_weights='/root/Downloads/ML TASK/yolov3.weights'
import cv2 as cv
import numpy as np
cap = cv.VideoCapture("video.mp4")
whT = 320
confThreshold =0.5
nmsThreshold= 0.2
#### LOAD MODEL
## Coco Names
classesFile = "coco.names"
classNames = open(classesFile).read().strip().split("\n")
print(classNames)
## Model Files
modelConfiguration = "yolov3.cfg"
modelWeights = "yolov3.weights"
net = cv.dnn.readNetFromDarknet(modelConfiguration, modelWeights)
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)
def findObjects(outputs,img):
hT, wT, cT = img.shape
bbox = []
classIds = []
confs = []
for output in outputs:
for det in output:
scores = det[5:]
classId = np.argmax(scores)
confidence = scores[classId]
if confidence > confThreshold:
w,h = int(det[2]*wT) , int(det[3]*hT)
x,y = int((det[0]*wT)-w/2) , int((det[1]*hT)-h/2)
bbox.append([x,y,w,h])
classIds.append(classId)
confs.append(float(confidence))
indices = cv.dnn.NMSBoxes(bbox, confs, confThreshold, nmsThreshold)
for i in indices:
i = i[0]
box = bbox[i]
x, y, w, h = box[0], box[1], box[2], box[3]
# print(x,y,w,h)
cv.rectangle(img, (x, y), (x+w,y+h), (255, 0 , 255), 2)
cv.putText(img,f'{classNames[classIds[i]].upper()} {int(confs[i]*100)}%',
(x, y-10), cv.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 2)
while True:
success, img = cap.read()
blob = cv.dnn.blobFromImage(img, 1 / 255, (whT, whT), [0, 0, 0], 1, crop=False)
net.setInput(blob)
layersNames = net.getLayerNames()
outputNames = [(layersNames[i[0] - 1]) for i in net.getUnconnectedOutLayers()]
outputs = net.forward(outputNames)
findObjects(outputs,img)
cv.imshow('Image', img)
cv.waitKey(1)