How do I make input type and weight type the same?

I am getting a runtime error saying the input and weight types must be the same. I made sure my model and my input are on the same device, yet I cannot get rid of the error. As far as I can tell, my input data is not on the GPU. Since the image is the input in this case, I tried img = torch.from_numpy(img).to(device) and pred = model(img)[0].to(device), but with no success. Please tell me what I can do.
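A quick way to confirm where things actually live before the forward pass (a minimal check, assuming the model and img variables from the code below):

print(next(model.parameters()).device)  # device holding the weights
print(img.device)                       # device holding the input tensor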

Here is the code:

source = '0'
webcam = source == '0'
image_size = 640
imgsz = check_img_size(image_size)
# Load the model
filepath = 'weights/mask.pt'
# device = torch.device('cpu')
device = select_device()
# half = device.type != 'cpu'
model = attempt_load(filepath, map_location = device)
model.to(device).eval()
# if half:
#     model.half()

# Second stage classifier 
classify = False
if classify:
    modelc = torch_utils.load_classifier(name = 'resnet101', n = 2)
    modelc.load_state_dict(torch.load('weights/resnet101.pt', map_location = device)['modelc'])
    modelc.to(device).eval()

vid_path, vid_writer = None, None
if webcam:
    view_img = True
    cudnn.benchmark = True
    dataset = LoadStreams(source, img_size = imgsz)
    
    
names = model.module.names if hasattr(model, 'module') else model.names
print(names)


def process_image(image):
    h, w = image.shape[:2]
    desired_size = 416
    ratio = desired_size/w
    print("Ratio",ratio)
    img = cv2.resize(image, (0, 0), fx = ratio, fy = ratio)
    h, w = img.shape[:2]
    img = cv2.copyMakeBorder(img, int((416-h)/2), int((416-h)/2), 0, 0, cv2.BORDER_CONSTANT)  # pad height to 416
    img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, HWC to CHW
    img = np.ascontiguousarray(img)
    img = torch.from_numpy(img).to(device)  # move the input tensor to the model's device
    img = img.float()
    img /= 255.0  # 0-255 to 0.0-1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)  # add a batch dimension
    
    return img

def classify(image):
    # device = torch.device("cpu")
    
    #image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    #im = Image.fromarray(image)
    img = process_image(image)
    print('Image processed')
    # img = image.unsqueeze_(0)
    # img = image.float()
    
    pred = model(img)[0]
    
    # Apply NMS
    pred = non_max_suppression(pred, 0.4, 0.5, classes = [0, 1, 2], agnostic = None)
    if classify:
        pred = apply_classifier(pred, modelc, img, im0s)
    print("1 ", pred)
    
    model.eval()
    model.cpu()

    classification = torch.cat(pred)[:, -1]
    if len(classification) == 0:
        return None
    
    index = int(classification[0])
    print(names[index])
    return names[index]

def detect(frame):
    source = '0'
    webcam = source == '0'
    
    image_size = 640
    imgsz = check_img_size(image_size)
    
    # Load model
    file_path = 'weights/yolov5s.pt'
    #device = torch.device('cpu')
    device = select_device()
    # half = device.type != 'cpu'
    model = attempt_load(file_path, map_location = device)
    model.to(device).eval()
    # if half:
    #     model.half()
    names = model.module.names if hasattr(model, 'module') else model.names
    colors = [[75, 125, 2]]
    img = process_image(frame)
    pred = model(img)[0]
    pred = non_max_suppression(pred, 0.4, 0.5, classes = [0], agnostic = None)
    if classify:
        pred = apply_classifier(pred, modelc, img, im0s)
    gn = torch.tensor(frame.shape)[[1, 0, 1, 0]]  # normalization gain whwh
    for i, det in enumerate(pred):
        det[:,:4] = scale_coords(img.shape[2:], det[:,:4], frame.shape).round()
        
        for *xyxy, conf, cls in reversed(det):
            xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
            label = '%s %.2f' % (names[int(cls)], conf)
            if label is not None:
                if (label.split())[0] == 'person':
                    plot_one_box(xyxy, frame, label = label, color = colors[0], line_thickness = 1)  # utils.general

The main code is as follows:

with tf.Graph().as_default():
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False))
    with sess.as_default():
        pnet, rnet, onet = detect_face.create_mtcnn(sess, './models/')

        minsize = 20  # minimum size of face
        threshold = [0.6, 0.7, 0.7]  # three steps' threshold
        factor = 0.709  # scale factor
        margin = 44
        frame_interval = 3
        batch_size = 1000
        image_size = 182
        input_image_size = 160

        print('Loading feature extraction model')
        modeldir = './models/'
        facenet.load_model(modeldir)

        images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
        embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
        phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")
        embedding_size = embeddings.get_shape()[1]

        classifier_filename = './myclassifier/my_classifier.pkl'
        classifier_filename_exp = os.path.expanduser(classifier_filename)
        with open(classifier_filename_exp, 'rb') as infile:
            (model, class_names) = pickle.load(infile)
            print('load classifier file-> %s' % type(class_names))
        HumanNames = class_names
        video_capture = cv2.VideoCapture(0)
        c = 0

        print('Start!')
        prevTime = 0
        while True:
            ret, frame = video_capture.read()

            # frame = cv2.resize(frame, (0,0), fx=0.5, fy=0.5)    #resize frame (optional)

            curTime = time.time()    # calc fps
            timeF = frame_interval

            if (c % timeF == 0):
                find_results = []

                if frame.ndim == 2:
                    frame = facenet.to_rgb(frame)
                frame = frame[:, :, 0:3]
                bounding_boxes, _ = detect_face.detect_face(frame, minsize, pnet, rnet, onet, threshold, factor)
                nrof_faces = bounding_boxes.shape[0]
                # print('Bounding Boxes: ', bounding_boxes, 'Shape: ', bounding_boxes.shape, 'nrof_faces:: ', nrof_faces)
                # print('Detected_FaceNum: %d' % nrof_faces)

                if nrof_faces > 0:
                    detect(frame)
                    label = classify(frame)
                    if label == "a":
                        det = bounding_boxes[:, 0:4]
                        img_size = np.asarray(frame.shape)[0:2]
    
                        cropped = []
                        scaled = []
                        scaled_reshape = []
                        bb = np.zeros((nrof_faces,4), dtype=np.int32)
    
                        for i in range(nrof_faces):
                            emb_array = np.zeros((1, embedding_size))
                            # print("Embeddinigs:::::")
                            # print(emb_array)
                            # print("Embeddinigs:::::")
                            bb[i][0] = det[i][0]
                            bb[i][1] = det[i][1]
                            bb[i][2] = det[i][2]
                            bb[i][3] = det[i][3]
    
                            if bb[i][0] <= 0 or bb[i][1] <= 0 or bb[i][2] >= len(frame[0]) or bb[i][3] >= len(frame):
                                print('face is inner of range!')
                                continue
    
                            cropped.append(frame[bb[i][1]:bb[i][3], bb[i][0]:bb[i][2], :])
                            cropped[0] = facenet.flip(cropped[0], False)
                            scaled.append(misc.imresize(cropped[0], (image_size, image_size), interp='bilinear'))
                            scaled[0] = cv2.resize(scaled[0], (input_image_size,input_image_size),
                                                   interpolation=cv2.INTER_CUBIC)
                            scaled[0] = facenet.prewhiten(scaled[0])
                            scaled_reshape.append(scaled[0].reshape(-1,input_image_size,input_image_size,3))
                            feed_dict = {images_placeholder: scaled_reshape[0], phase_train_placeholder: False}
                            emb_array[0, :] = sess.run(embeddings, feed_dict=feed_dict)

                            predictions = model.predict_proba(emb_array)
                            best_class_indices = np.argmax(predictions, axis=1)
                            best_class_probabilities = predictions[np.arange(len(best_class_indices)), best_class_indices]
                            cv2.rectangle(frame, (bb[i][0], bb[i][1]), (bb[i][2], bb[i][3]), (0, 255, 0), 2)
                            text_x = bb[i][0]
                            text_y = bb[i][3] + 20

(EDIT) The error:

Traceback (most recent call last):
  File "realtime.py", line 105, in <module>
    label = classify(frame)
  File "yolov5-master\myutils.py", line 117, in classify
    pred = model(img)[0]
  File "C:\Users\Anuj\anaconda3\envs\py36\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "yolov5-master\models\yolo.py", line 122, in forward
    return self.forward_once(x, profile)  # single-scale inference, train
  File "yolov5-master\models\yolo.py", line 138, in forward_once
    x = m(x)  # run
  File "C:\Users\Anuj\anaconda3\envs\py36\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "yolov5-master\models\common.py", line 94, in forward
    return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1))
  File "C:\Users\Anuj\anaconda3\envs\py36\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "yolov5-master\models\common.py", line 38, in fuseforward
    return self.act(self.conv(x))
  File "C:\Users\Anuj\anaconda3\envs\py36\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "C:\Users\Anuj\anaconda3\envs\py36\lib\site-packages\torch\nn\modules\conv.py", line 419, in forward
    return self._conv_forward(input, self.weight)
  File "C:\Users\Anuj\anaconda3\envs\py36\lib\site-packages\torch\nn\modules\conv.py", line 416, in _conv_forward
    self.padding, self.dilation, self.groups)
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

You need to send the input tensor to your device, not the result of the forward pass:

pred = model(img.to(device))[0]

As a side note, x.to(device) used as a bare expression has no effect on where the tensor lives. You have to reassign the tensor with x = x.to(device):
>>> x = torch.ones(1)
>>> x.to(device)
tensor([1], device='cuda:0')
>>> x.is_cuda
False
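
For comparison, reassigning the result does move the tensor:

>>> x = x.to(device)
>>> x.is_cuda
True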

This does not apply to nn.Module: calling model.to(device) is enough, because Module.to moves the parameters in place.
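
A minimal illustration of the difference, assuming a CUDA device is available:

>>> import torch.nn as nn
>>> lin = nn.Linear(1, 1)
>>> lin.to(device)  # moves the parameters in place (and also returns self)
Linear(in_features=1, out_features=1, bias=True)
>>> lin.weight.is_cuda
True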


EDIT - In classify, you send the model back to the CPU with model.cpu(), yet you keep calling it on img, which lives on the GPU. Since classify is called inside a loop, the first forward pass works, but the following calls fail.
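
A minimal sketch of the corrected function, assuming the model should stay on the GPU for the whole loop (the model.cpu() call is dropped, along with the per-call model.eval(), since eval mode was already set once after loading):

def classify(image):
    img = process_image(image)  # process_image already moves the tensor to `device`
    pred = model(img)[0]        # model stays on `device`, so input and weight types match on every call
    pred = non_max_suppression(pred, 0.4, 0.5, classes = [0, 1, 2], agnostic = None)
    classification = torch.cat(pred)[:, -1]
    if len(classification) == 0:
        return None
    index = int(classification[0])
    return names[index]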