diff --git a/TM.py b/TM.py
index 2afe51e..1420770 100644
--- a/TM.py
+++ b/TM.py
@@ -134,65 +134,45 @@ class HandDetector:
 class Main:
     def __init__(self):
         self.detector = None
-        self.camera = cv2.VideoCapture(0, cv2.CAP_DSHOW)
-        self.camera.set(3, 1280)
-        self.camera.set(4, 720)
+        self.camera = None
+        # self.camera = cv2.VideoCapture(0, cv2.CAP_DSHOW)
+        # self.camera.set(3, 1280)
+        # self.camera.set(4, 720)
 
-    def gesture_recognition(self):
-        self.detector = HandDetector()
-        while True:
-            frame, img = self.camera.read()
-            img = self.detector.find_hands(img)
-            lm_list, bbox = self.detector.find_position(img)
+    def gesture_recognition(self, img, detector):
+        self.detector = detector
+        lm_list, bbox = detector.find_position(img)
 
-            if lm_list:
-                x_1, y_1 = bbox["bbox"][0], bbox["bbox"][1]
-                x1, x2, x3, x4, x5 = self.detector.fingers_up()
-                if (np.linalg.norm(lm_list[4]-lm_list[8]) < 50) and (np.linalg.norm(lm_list[4]-lm_list[12]) < 50):
-                    cv2.putText(img, "7_SEVEN", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-                elif (np.linalg.norm(lm_list[4]-lm_list[8]) < 50) and (x4 == 1 and x5 == 1 and x3 == 1):
-                    cv2.putText(img, "OK", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-                elif (np.linalg.norm(lm_list[4]-lm_list[12]) < 50) and (x4 == 1 and x5 == 1 and x2 == 1):
-                    cv2.putText(img, "flip", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-                elif (x2 == 1 and x3 == 1) and (x4 == 0 and x5 == 0 and x1 == 0):
-                    cv2.putText(img, "2_TWO", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-                elif (x2 == 1 and x3 == 1 and x4 == 1) and (x1 == 0 and x5 == 0):
-                    cv2.putText(img, "3_THREE", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-                elif (x2 == 1 and x3 == 1 and x4 == 1 and x5 == 1) and (x1 == 0):
-                    cv2.putText(img, "4_FOUR", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-                elif x1 == 1 and x2 == 1 and x3 == 1 and x4 == 1 and x5 == 1:
-                    cv2.putText(img, "5_FIVE", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-                elif (x2 == 1 and x1 == 0) and (x3 == 0 and x4 == 0 and x5 == 0):
-                    cv2.putText(img, "1_ONE", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-                elif (x1 == 1 and x2 == 1) and (x3 == 0 and x4 == 0 and x5 == 0):
-                    cv2.putText(img, "8_EIGHT", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-                elif (x1 == 1 and x5 == 1) and (x3 == 0 and x4 == 0 and x2 == 0):
-                    cv2.putText(img, "6_SIX", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-                elif x1 and (x2 == 0 and x3 == 0 and x4 == 0 and x5 == 0):
-                    cv2.putText(img, "GOOD!", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-                elif (x1 == 1 and x2 == 1 and x5 == 1) and (x3 == 0 and x4 == 0):
-                    cv2.putText(img, "yo", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-                else:
-                    cv2.putText(img, "unknown", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-            cv2.imshow("camera", img)
-            key = cv2.waitKey(1)
-            if cv2.getWindowProperty('camera', cv2.WND_PROP_VISIBLE) < 1:
-                break
-            elif key == 27:
-                break
+        if lm_list:
+            x_1, y_1 = bbox["bbox"][0], bbox["bbox"][1]
+            x1, x2, x3, x4, x5 = detector.fingers_up()
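+            # x1..x5 are the thumb-to-little-finger "up" flags from fingers_up();
+            # the np.linalg.norm checks measure fingertip distances (landmarks
+            # 4/8/12 are the thumb, index and middle tips) for pinch-style poses.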
+            if (np.linalg.norm(lm_list[4]-lm_list[8]) < 50) and (np.linalg.norm(lm_list[4]-lm_list[12]) < 50):
+                cv2.putText(img, "7_SEVEN", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
+                            (0, 0, 255), 3)
+            elif (x2 == 1 and x3 == 1) and (x4 == 0 and x5 == 0 and x1 == 0):
+                cv2.putText(img, "2_TWO", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
+                            (0, 0, 255), 3)
+            elif (x2 == 1 and x3 == 1 and x4 == 1) and (x1 == 0 and x5 == 0):
+                cv2.putText(img, "3_THREE", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
+                            (0, 0, 255), 3)
+            elif (x2 == 1 and x3 == 1 and x4 == 1 and x5 == 1) and (x1 == 0):
+                cv2.putText(img, "4_FOUR", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
+                            (0, 0, 255), 3)
+            elif x1 == 1 and x2 == 1 and x3 == 1 and x4 == 1 and x5 == 1:
+                cv2.putText(img, "5_FIVE", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
+                            (0, 0, 255), 3)
+            elif (x2 == 1 and x1 == 0) and (x3 == 0 and x4 == 0 and x5 == 0):
+                cv2.putText(img, "1_ONE", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
+                            (0, 0, 255), 3)
+            elif (x1 == 1 and x2 == 1) and (x3 == 0 and x4 == 0 and x5 == 0):
+                cv2.putText(img, "8_EIGHT", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
+                            (0, 0, 255), 3)
+            elif (x1 == 1 and x5 == 1) and (x3 == 0 and x4 == 0 and x2 == 0):
+                cv2.putText(img, "6_SIX", (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
+                            (0, 0, 255), 3)
+            else:
+                return 1
+        return 0
 
 
 if __name__ == '__main__':
diff --git a/ai.py b/ai.py
index 5bcd6f2..60f445f 100644
--- a/ai.py
+++ b/ai.py
@@ -6,13 +6,12 @@
 @ by: Leaf
 @ date: 2022-05-28
 """
-
+import tkinter as tk
 import cv2
 import mediapipe as mp
 import torch
 import torch.nn as nn
 import numpy as np
-import tkinter as tk
 import shutil
 import math
 from scipy import stats
@@ -33,7 +32,7 @@ def rotate(angle, x, y, point_x, point_y):
 def normalize(x):
     max_x = np.max(x)
     min_x = np.min(x)
-    return (x-min_x)/(max_x-min_x)
+    return (x - min_x) / (max_x - min_x)
 
 
 class CNN(nn.Module):
@@ -57,7 +56,7 @@ class CNN(nn.Module):
             nn.MaxPool2d(2),
         )
         self.med = nn.Linear(32 * 11 * 2, 500)
-        self.med2 = nn.Linear(1*21*3, 100)
+        self.med2 = nn.Linear(1 * 21 * 3, 100)
         self.med3 = nn.Linear(100, 500)
         self.out = nn.Linear(500, m)  # fully connected layer, output 10 classes
 
@@ -285,6 +284,23 @@ class Main:
         self.len_x = 22
         self.len_y = 4
         self.label = ''
+        self.result = []
+        self.disp = ""
+
+    def change_state(self):
+        self.label = self.entry.get()  # fetch the text typed into the Entry widget
+        self.top1.quit()
+        if self.label == "":
+            self.top1.destroy()
+
+    def make_datasets(self, camera, datasets_dir="default", n=100):
+        if datasets_dir == "default":
+            return
+        if exists(datasets_dir):
+            shutil.rmtree(datasets_dir)
+        mkdir(datasets_dir)
+        self.camera = camera
+
         self.top1 = tk.Tk()
         self.top1.geometry('300x50')
         self.top1.title('请输入标签')
         tk.Label(self.top1, text='Label:').place(x=27, y=10)
         self.entry = tk.Entry(self.top1, width=15)
         self.entry.place(x=80, y=10)
         tk.Button(self.top1, text='确定', command=self.change_state).place(x=235, y=5)
 
-    def change_state(self):
-        self.label = self.entry.get()  # fetch the text typed into the Entry widget
-        self.top1.quit()
-        if self.label == "":
-            self.top1.destroy()
-
-    def make_datasets(self, datasets_dir="default", n=100):
-        if datasets_dir == "default":
-            return
-        if exists(datasets_dir):
-            shutil.rmtree(datasets_dir)
-        mkdir(datasets_dir)
-        if self.camera is None:
-            self.camera = cv2.VideoCapture(0, cv2.CAP_DSHOW)
-            self.camera.set(3, 1280)
-            self.camera.set(4, 720)
         self.top1.mainloop()
         while not self.label == "":
             data = np.zeros([n, self.len_x, self.len_y])
@@ -349,7 +349,6 @@ class Main:
             open(datasets_dir + "/" + self.label + ".npz", "w")
             np.savez(datasets_dir + "/" + self.label + ".npz", label=self.label, data=data,
                      handtype=hand_type, shape=shape_list)
-            self.top1.mainloop()
 
     def train(self, datasets_dir="default"):
@@ -359,21 +358,75 @@ class Main:
         ai.load_datasets()
         ai.train_cnn()
 
-    def gesture_recognition(self):
-        if self.camera is None:
-            self.camera = cv2.VideoCapture(0, cv2.CAP_DSHOW)
-            self.camera.set(3, 1280)
-            self.camera.set(4, 720)
+    def gesture_recognition_camera(self, detector, img, cnn):
+        self.detector = detector
+        out_label = cnn.out_label
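+        # Per-frame entry point: gr.py owns the capture loop and passes each
+        # frame in, so this method no longer opens a camera itself.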
+        img = self.detector.find_hands(img)
+        lm_list, bbox = self.detector.find_position(img)
+
+        if lm_list.any():
+            x_1, y_1 = bbox["bbox"][0], bbox["bbox"][1]
+            data = torch.Tensor(lm_list)
+            data = data.unsqueeze(0)
+            data = data.unsqueeze(0)
+
+            test_output = cnn(data)
+            self.result.append(torch.max(test_output, 1)[1].data.cpu().numpy()[0])
+            if len(self.result) > 5:
+                self.disp = str(out_label[stats.mode(self.result)[0][0]])
+                self.result = []
+
+            cv2.putText(img, self.disp, (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
+                        (0, 0, 255), 3)
+            return 0
+        # assumed convention, mirroring TM.Main.gesture_recognition: 1 tells the
+        # caller (gr.py) that no hand was matched so it can fall back further
+        return 1
 
+    def gesture_recognition_video(self, filedir):
         self.detector = HandDetector()
         cnn = torch.load("CNN.pkl")
         out_label = cnn.out_label
         result = []
         disp = ""
+        cap = cv2.VideoCapture(filedir)
         while True:
-            frame, img = self.camera.read()
+            ret, img = cap.read()
             img = self.detector.find_hands(img)
             lm_list, bbox = self.detector.find_position(img)
 
+            if lm_list.any():
+                x_1, y_1 = bbox["bbox"][0], bbox["bbox"][1]
+                data = torch.Tensor(lm_list)
+                data = data.unsqueeze(0)
+                data = data.unsqueeze(0)
+
+                test_output = cnn(data)
+                result.append(torch.max(test_output, 1)[1].data.cpu().numpy()[0])
+
+                if len(result) > 5:
+                    disp = str(out_label[stats.mode(result)[0][0]])
+                    result = []
+
+                cv2.putText(img, disp, (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
+                            (0, 0, 255), 3)
+
+            cv2.imshow("camera", img)
+            key = cv2.waitKey(1)
+            if cv2.getWindowProperty('camera', cv2.WND_PROP_VISIBLE) < 1:
+                break
+            elif key == 27:
+                break
+        cap.release()
+
+    def gesture_recognition_img(self, filedir):
+        self.detector = HandDetector()
+        cnn = torch.load("CNN.pkl")
+        out_label = cnn.out_label
+        result = []
+        disp = ""
+        img = cv2.imread(filedir)
+        img = self.detector.find_hands(img)
+        while True:
+            lm_list, bbox = self.detector.find_position(img)
+
+            if lm_list.any():
                 x_1, y_1 = bbox["bbox"][0], bbox["bbox"][1]
                 data = torch.Tensor(lm_list)
@@ -402,4 +455,8 @@ if __name__ == '__main__':
     my_datasets_dir = "test"
     solution.make_datasets(my_datasets_dir, 100)
     solution.train(my_datasets_dir)
-    solution.gesture_recognition()
+    dir_video = "C:/Users/Liar/Pictures/Camera Roll/WIN_20220630_20_11_47_Pro.mp4"
+    dir_img = "C:/Users/Liar/Pictures/Camera Roll/WIN_20220630_20_01_22_Pro.jpg"
+    # solution.gesture_recognition_camera()
+    # solution.gesture_recognition_video(dir_video)
+    # solution.gesture_recognition_img(dir_img)
diff --git a/ai_UI.py b/ai_UI.py
deleted file mode 100644
index a97a697..0000000
--- a/ai_UI.py
+++ /dev/null
@@ -1,476 +0,0 @@
-# -*- coding:utf-8 -*-
-
-"""
-Group project for the Signal Design course
-
-@ by: Leaf
-@ date: 2022-05-28
-"""
-import tkinter as tk
-import cv2
-import mediapipe as mp
-import torch
-import torch.nn as nn
-import numpy as np
-import shutil
-import math
-from scipy import stats
-from os.path import exists
-from os import mkdir
-from pathlib import Path
-from torch.utils.data import DataLoader, TensorDataset
-
-
-# rotation helper
-def rotate(angle, x, y, point_x, point_y):
-    px = (x - point_x) * math.cos(angle) - (y - point_y) * math.sin(angle) + point_x
-    py = (x - point_x) * math.sin(angle) + (y - point_y) * math.cos(angle) + point_y
-    return px, py
-
-
-# normalization helper
-def normalize(x):
-    max_x = np.max(x)
-    min_x = np.min(x)
-    return (x - min_x) / (max_x - min_x)
-
-
-class CNN(nn.Module):
-    def __init__(self, m):
-        super(CNN, self).__init__()
-        self.out_label = []
-        self.conv1 = nn.Sequential(
-            nn.Conv2d(
-                in_channels=1,
-                out_channels=16,
-                kernel_size=5,
-                stride=1,
-                padding=2,
-            ),
-            nn.ReLU(),
-            nn.MaxPool2d(kernel_size=1),
-        )
-        self.conv2 = nn.Sequential(
-            nn.Conv2d(16, 32, 5, 1, 2),
-            nn.ReLU(),
-            nn.MaxPool2d(2),
-        )
-        self.med = nn.Linear(32 * 11 * 2, 500)
-        self.med2 = nn.Linear(1 * 21 * 3, 100)
-        self.med3 = nn.Linear(100, 500)
-        self.out = nn.Linear(500, m)  # fully connected layer, output 10 classes
-
-    def forward(self, x):
-        x = self.conv1(x)
-        x = self.conv2(x)
-        x = x.view(x.size(0), -1)  # flatten the conv feature maps to (batch_size, features)
-        x = self.med(x)
-        # x = self.med2(x)
-        # x = self.med3(x)
-        output = self.out(x)
-        return output
-
-
-class HandDetector:
-    """
-    Finds hands using the mediapipe library and exports landmarks in pixel
-    format. Adds extra helpers, e.g. how many fingers are up or the distance
-    between two fingers, and reports the bounding box of the detected hand.
-    """
-
-    def __init__(self, mode=False, max_hands=2, detection_con=0.5, min_track_con=0.5):
-        """
-        :param mode: run detection on every image (static mode)
-        :param max_hands: maximum number of hands to detect
-        :param detection_con: minimum detection confidence
-        :param min_track_con: minimum tracking confidence
-        """
-        self.results = None
-        self.mode = mode
-        self.max_hands = max_hands
-        self.modelComplex = 1
-        self.detection_con = detection_con
-        self.min_track_con = min_track_con
-
-        # initialize the hand-detection model
-        self.mpHands = mp.solutions.hands
-        self.hands = self.mpHands.Hands(static_image_mode=self.mode,
-                                        max_num_hands=self.max_hands,
-                                        min_detection_confidence=self.detection_con,
-                                        min_tracking_confidence=self.min_track_con)
-        self.mpDraw = mp.solutions.drawing_utils  # initialize the drawing utility
-        self.tipIds = [4, 8, 12, 16, 20]  # fingertip landmark ids
-        self.fingers = []
-        self.lmList = []
-        self.re_lmList = []
-
-    def find_hands(self, img, draw=True):
-        """
-        Find hands in a BGR image.
-        :param img: image to search for hands.
-        :param draw: whether to draw the detections on the image.
-        :return: the image, with or without drawings
-        """
-        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # convert the incoming BGR frame to RGB before running mediapipe
-        self.results = self.hands.process(img_rgb)
-
-        if self.results.multi_hand_landmarks:
-            for handLms in self.results.multi_hand_landmarks:
-                if draw:
-                    self.mpDraw.draw_landmarks(img, handLms,
-                                               self.mpHands.HAND_CONNECTIONS)
-        return img
-
-    def find_position(self, img, hand_no=0, draw=True):
-        """
-        Find the landmarks of a single hand in pixel format and return them
-        together with the hand's bounding box.
-        :param img: main image to search
-        :param hand_no: hand id if several hands are detected
-        :param draw: whether to draw the output (a rectangle by default)
-        :return: landmark list in pixel format; hand bounding box
-        """
-        x_list = []
-        y_list = []
-        bbox_info = []
-        self.lmList = []
-        h, w, c = img.shape
-        if self.results.multi_hand_landmarks:
-            my_hand = self.results.multi_hand_landmarks[hand_no]
-            for i, lm in enumerate(my_hand.landmark):
-                px, py = int(lm.x * w), int(lm.y * h)
-                x_list.append(px)
-                y_list.append(py)
-                self.lmList.append([lm.x, lm.y, 0])
-                if draw:
-                    cv2.circle(img, (px, py), 5, (255, 0, 255), cv2.FILLED)
-            x_min, x_max = min(x_list), max(x_list)
-            y_min, y_max = min(y_list), max(y_list)
-            box_w, box_h = x_max - x_min, y_max - y_min
-            bbox = x_min, y_min, box_w, box_h
-            cx, cy = bbox[0] + (bbox[2] // 2), bbox[1] + (bbox[3] // 2)
-            bbox_info = {"id": hand_no, "bbox": bbox, "center": (cx, cy), "shape": (h, w)}
-
-            if draw:
-                cv2.rectangle(img, (bbox[0] - 20, bbox[1] - 20),
-                              (bbox[0] + bbox[2] + 20, bbox[1] + bbox[3] + 20),
-                              (0, 255, 0), 2)
-
-        self.revolve(img)
-        self.re_lmList = np.array(self.re_lmList)
-        if self.re_lmList.any():
-            self.re_lmList = np.concatenate((np.zeros((21, 1)), self.re_lmList), axis=1)
-            self.re_lmList = np.concatenate((self.re_lmList, np.zeros((1, 4))), axis=0)
-
-        return self.re_lmList, bbox_info
-
-    def revolve(self, img, draw=True):
-        """
-        Rotate the recognized hand landmarks.
-        :param img: main image to search
-        :param draw: whether to draw the output (a rectangle by default)
-        :return: landmark list in pixel format
-        """
-        h, w, c = img.shape
-        if len(self.lmList) >= 21:
-            # print(self.lmList)
-            self.re_lmList = []
-            point_x = self.lmList[0][0]
-            point_y = self.lmList[0][1]
-            delta_x = self.lmList[13][0] - point_x
-            delta_y = self.lmList[13][1] - point_y
-            if delta_y == 0:
-                if delta_x < 0:
-                    theta = math.pi / 2
-                else:
-                    theta = -math.pi / 2
-            else:
-                theta = math.atan(delta_x / delta_y)
-                if delta_y > 0:
-                    theta = theta + math.pi
-            # print(theta*180/math.pi)
-            for i in self.lmList:
-                px, py = rotate(theta, i[0] * w, i[1] * h, point_x * w, point_y * h)
-                self.re_lmList.append([px, py, 0])
-                if draw:
-                    cv2.circle(img, (int(px), int(py)), 5, (0, 0, 255), cv2.FILLED)
-            # normalization
-            x_array = normalize(np.array(self.re_lmList)[:, 0])
-            # print(x_array)
-            for i in range(len(x_array)):
-                self.re_lmList[i][0] = x_array[i]
-            y_array = normalize(np.array(self.re_lmList)[:, 1])
-            for i in range(len(y_array)):
-                self.re_lmList[i][1] = x_array[i]
-        else:
-            self.re_lmList = self.lmList
-        return self.re_lmList
-
-    def hand_type(self):
-        """
-        Check whether the detected hand is left or right.
-        :return: 1 or 0
-        """
-        if self.results.multi_hand_landmarks:
-            if self.lmList[17][0] < self.lmList[5][0]:
-                return 1
-            else:
-                return 0
-
-
-class AI:
-    def __init__(self, datasets_dir):
-        self.EPOCH = 20
-        self.BATCH_SIZE = 2
-        self.LR = 10e-5
-        self.DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.datasets_dir = datasets_dir
-        self.train_loader = None
-        self.m = 0
-        self.out_label = []  # maps the CNN's numeric output labels back to their string labels
-
-    def load_datasets(self):
-        train_data = []
-        train_label = []
-        self.m = 0
-        for file in Path(self.datasets_dir).rglob("*.npz"):
-            data = np.load(str(file))
-            train_data.append(data["data"])
-            label_number = np.ones(len(data["data"])) * len(self.out_label)
-            train_label.append(label_number)
-            self.out_label.append(data["label"])
-            self.m += 1
-        train_data = torch.Tensor(np.concatenate(train_data, axis=0))
-        train_data = train_data.unsqueeze(1)
-        train_label = torch.tensor(np.concatenate(train_label, axis=0)).long()
-
-        dataset = TensorDataset(train_data, train_label)
-        self.train_loader = DataLoader(dataset, batch_size=self.BATCH_SIZE, shuffle=True)
-        return self.m
-
-    def train_cnn(self):
-        cnn = CNN(self.m).to(self.DEVICE)
-        optimizer = torch.optim.Adam(cnn.parameters(), self.LR)  # optimize all cnn parameters
-        loss_func = nn.CrossEntropyLoss()  # the target label is not one-hotted
-
-        for epoch in range(self.EPOCH):
-            for step, (data, target) in enumerate(self.train_loader):
-                # move the batch data to the device
-                data, target = data.to(self.DEVICE), target.to(self.DEVICE)
-                output = cnn(data)  # cnn output
-                loss = loss_func(output, target)  # cross entropy loss
-                optimizer.zero_grad()  # clear gradients for this training step
-                loss.backward()  # backpropagation, compute gradients
-                optimizer.step()  # apply gradients
-                if (step + 1) % 50 == 0:  # print progress
-                    print(
-                        "\r[Epoch: %d] [%d/%d (%0.f %%)][Loss: %f]"
-                        % (
-                            epoch + 1,
-                            (step + 1) * len(data),
-                            len(self.train_loader.dataset),
-                            100. * (step + 1) / len(self.train_loader),
-                            loss.item()
-                        ), end="")
-
-        cnn.out_label = self.out_label
-        torch.save(cnn, 'CNN.pkl')
-        print("训练结束")
-
-
-class Main:
-    def __init__(self):
-        self.camera = None
-        self.detector = HandDetector()
-        self.default_datasets = "Datasets"
-        self.len_x = 22
-        self.len_y = 4
-        self.label = ''
-        self.top1 = tk.Tk()
-        self.top1.geometry('300x50')
-        self.top1.title('请输入标签')
-        tk.Label(self.top1, text='Label:').place(x=27, y=10)
-        self.entry = tk.Entry(self.top1, width=15)
-        self.entry.place(x=80, y=10)
-        tk.Button(self.top1, text='确定', command=self.change_state).place(x=235, y=5)
-
-    def change_state(self):
-        self.label = self.entry.get()  # fetch the text typed into the Entry widget
-        self.top1.quit()
-        if self.label == "":
-            self.top1.destroy()
-
-    def make_datasets(self, datasets_dir="default", n=100):
-        if datasets_dir == "default":
-            return
-        if exists(datasets_dir):
-            shutil.rmtree(datasets_dir)
-        mkdir(datasets_dir)
-        if self.camera is None:
-            self.camera = cv2.VideoCapture(0, cv2.CAP_DSHOW)
-            self.camera.set(3, 1280)
-            self.camera.set(4, 720)
-        self.top1.mainloop()
-        while not self.label == "":
-            data = np.zeros([n, self.len_x, self.len_y])
-            shape_list = np.zeros([n, 2], dtype=np.int16)
-            hand_type = np.zeros(n, dtype=np.int8)
-
-            count = 0
-            cv2.startWindowThread()
-            while True:
-                frame, img = self.camera.read()
-                img = self.detector.find_hands(img)
-                result = np.zeros((self.len_x, self.len_y))
-
-                lm_list, bbox = self.detector.find_position(img)
-                for i in range(len(lm_list)):
-                    result[i] = np.array(lm_list[i])
-                if result.sum() > 0:  # when the matrix is non-zero, i.e. a hand was captured
-
-                    shape = bbox["shape"]
-                    x_1, y_1 = bbox["bbox"][0], bbox["bbox"][1]
-                    data[count] = result
-                    hand_type[count] = self.detector.hand_type()
-                    shape_list[count] = np.array(shape)
-                    count += 1
-                    cv2.putText(img, str("{}/{}".format(count, n)), (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 255, 0), 3)
-
-                cv2.imshow("camera", img)
-                key = cv2.waitKey(100)
-                if cv2.getWindowProperty('camera', cv2.WND_PROP_VISIBLE) < 1:
-                    break
-                elif key == 27:
-                    break
-                elif count == n - 1:
-                    break
-            cv2.destroyAllWindows()
-
-            open(datasets_dir + "/" + self.label + ".npz", "w")
-            np.savez(datasets_dir + "/" + self.label + ".npz", label=self.label, data=data,
-                     handtype=hand_type, shape=shape_list)
-            self.top1.mainloop()
-
-    def train(self, datasets_dir="default"):
-        if datasets_dir == "default":
-            datasets_dir = self.default_datasets
-        ai = AI(datasets_dir)
-        ai.load_datasets()
-        ai.train_cnn()
-
-    def gesture_recognition_camera(self):
-        if self.camera is None:
-            self.camera = cv2.VideoCapture(0, cv2.CAP_DSHOW)
-            self.camera.set(3, 1280)
-            self.camera.set(4, 720)
-        self.detector = HandDetector()
-        cnn = torch.load("CNN.pkl")
-        out_label = cnn.out_label
-        result = []
-        disp = ""
-        while True:
-            frame, img = self.camera.read()
-            img = self.detector.find_hands(img)
-            lm_list, bbox = self.detector.find_position(img)
-
-            if lm_list.any():
-                x_1, y_1 = bbox["bbox"][0], bbox["bbox"][1]
-                data = torch.Tensor(lm_list)
-                data = data.unsqueeze(0)
-                data = data.unsqueeze(0)
-
-                test_output = cnn(data)
-                result.append(torch.max(test_output, 1)[1].data.cpu().numpy()[0])
-                if len(result) > 5:
-                    disp = str(out_label[stats.mode(result)[0][0]])
-                    result = []
-
-                cv2.putText(img, disp, (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                            (0, 0, 255), 3)
-
-            cv2.imshow("camera", img)
-            key = cv2.waitKey(1)
-            if cv2.getWindowProperty('camera', cv2.WND_PROP_VISIBLE) < 1:
-                break
-            elif key == 27:
-                break
-
-    def gesture_recognition_video(self, filedir):
-        self.detector = HandDetector()
-        cnn = torch.load("CNN.pkl")
-        out_label = cnn.out_label
-        result = []
-        disp = ""
-        cap = cv2.VideoCapture(filedir)
-        while True:
-            ret, img = cap.read()
-            img = self.detector.find_hands(img)
-            lm_list, bbox = self.detector.find_position(img)
-
-            if lm_list.any():
-                x_1, y_1 = bbox["bbox"][0], bbox["bbox"][1]
-                data = torch.Tensor(lm_list)
-                data = data.unsqueeze(0)
-                data = data.unsqueeze(0)
-
-                test_output = cnn(data)
-                result.append(torch.max(test_output, 1)[1].data.cpu().numpy()[0])
-                if len(result) > 5:
-                    disp = str(out_label[stats.mode(result)[0][0]])
-                    result = []
-
-                cv2.putText(img, disp, (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                            (0, 0, 255), 3)
-
-            cv2.imshow("camera", img)
-            key = cv2.waitKey(1)
-            if cv2.getWindowProperty('camera', cv2.WND_PROP_VISIBLE) < 1:
-                break
-            elif key == 27:
-                break
-        cap.release()
-
-
-    def gesture_recognition_img(self, filedir):
-        self.detector = HandDetector()
-        cnn = torch.load("CNN.pkl")
-        out_label = cnn.out_label
-        result = []
-        disp = ""
-        img = cv2.imread(filedir)
-        img = self.detector.find_hands(img)
-        while True:
-
-            lm_list, bbox = self.detector.find_position(img)
-
-            if lm_list.any():
-                x_1, y_1 = bbox["bbox"][0], bbox["bbox"][1]
-                data = torch.Tensor(lm_list)
-                data = data.unsqueeze(0)
-                data = data.unsqueeze(0)
-
-                test_output = cnn(data)
-                result.append(torch.max(test_output, 1)[1].data.cpu().numpy()[0])
-                if len(result) > 5:
-                    disp = str(out_label[stats.mode(result)[0][0]])
-                    result = []
-
-                cv2.putText(img, disp, (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                            (0, 0, 255), 3)
-
-            cv2.imshow("camera", img)
-            key = cv2.waitKey(1)
-            if cv2.getWindowProperty('camera', cv2.WND_PROP_VISIBLE) < 1:
-                break
-            elif key == 27:
-                break
-
-if __name__ == '__main__':
-    solution = Main()
-    my_datasets_dir = "test"
-    # solution.make_datasets(my_datasets_dir, 100)
-    # solution.train(my_datasets_dir)
-    dir_video = "C:/Users/Liar/Pictures/Camera Roll/WIN_20220630_20_11_47_Pro.mp4"
-    dir_img = "C:/Users/Liar/Pictures/Camera Roll/WIN_20220630_20_01_22_Pro.jpg"
-    solution.gesture_recognition_camera()
-    # solution.gesture_recognition_video(dir_video)
-    # solution.gesture_recognition_img(dir_img)
diff --git a/ai-two.py b/ai_two.py
similarity index 86%
rename from ai-two.py
rename to ai_two.py
index 70568bc..c0a1212 100644
--- a/ai-two.py
+++ b/ai_two.py
@@ -36,9 +36,9 @@ def normalize(x):
     return (x-min_x)/(max_x-min_x)
 
 
-class CNN(nn.Module):
+class CNNTwo(nn.Module):
     def __init__(self, m):
-        super(CNN, self).__init__()
+        super(CNNTwo, self).__init__()
         self.out_label = []
         self.conv1 = nn.Sequential(
             nn.Conv2d(
@@ -250,7 +250,7 @@ class AI:
         return self.m
 
     def train_cnn(self):
-        cnn = CNN(self.m).to(self.DEVICE)
+        cnn = CNNTwo(self.m).to(self.DEVICE)
         optimizer = torch.optim.Adam(cnn.parameters(), self.LR)  # optimize all cnn parameters
         loss_func = nn.CrossEntropyLoss()  # the target label is not one-hotted
 
@@ -275,7 +275,7 @@ class AI:
                     ), end="")
 
         cnn.out_label = self.out_label
-        torch.save(cnn, 'CNN.pkl')
+        torch.save(cnn, 'CNN_two.pkl')
         print("训练结束")
 
 
@@ -287,13 +287,9 @@ class Main:
         self.len_x = 44
         self.len_y = 4
         self.label = ''
-        self.top1 = tk.Tk()
-        self.top1.geometry('300x50')
-        self.top1.title('请输入标签')
-        tk.Label(self.top1, text='Label:').place(x=27, y=10)
-        self.entry = tk.Entry(self.top1, width=15)
-        self.entry.place(x=80, y=10)
-        tk.Button(self.top1, text='确定', command=self.change_state).place(x=235, y=5)
+
+        self.result = []
+        self.disp = ""
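+        # Shared per-frame state: `result` buffers recent CNN predictions and
+        # `disp` keeps the current majority-vote label between updates.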
 
     def change_state(self):
         self.label = self.entry.get()  # fetch the text typed into the Entry widget
@@ -301,16 +297,27 @@ class Main:
         if self.label == "":
             self.top1.destroy()
 
-    def make_datasets(self, datasets_dir="default", n=100):
+    def on_closing(self):
+        self.label = ""
+        self.top1.destroy()
+
+    def make_datasets(self, camera, datasets_dir="default", n=100):
         if datasets_dir == "default":
             return
         if exists(datasets_dir):
             shutil.rmtree(datasets_dir)
         mkdir(datasets_dir)
-        if self.camera is None:
-            self.camera = cv2.VideoCapture(0, cv2.CAP_DSHOW)
-            self.camera.set(3, 1280)
-            self.camera.set(4, 720)
+        self.camera = camera
+
+        self.top1 = tk.Tk()
+        self.top1.geometry('300x50')
+        self.top1.title('请输入标签')
+        self.top1.protocol("WM_DELETE_WINDOW", self.on_closing)
+        tk.Label(self.top1, text='Label:').place(x=27, y=10)
+        self.entry = tk.Entry(self.top1, width=15)
+        self.entry.place(x=80, y=10)
+        tk.Button(self.top1, text='确定', command=self.change_state).place(x=235, y=5)
+
         self.top1.mainloop()
         while not self.label == "":
             data = np.zeros([n, self.len_x, self.len_y])
@@ -369,47 +376,34 @@ class Main:
         ai.load_datasets()
         ai.train_cnn()
 
-    def gesture_recognition(self):
-        if self.camera is None:
-            self.camera = cv2.VideoCapture(0, cv2.CAP_DSHOW)
-            self.camera.set(3, 1280)
-            self.camera.set(4, 720)
-        self.detector = HandDetector()
-        cnn = torch.load("CNN.pkl")
+    def gesture_recognition(self, detector, img, cnn):
+        self.detector = detector
        out_label = cnn.out_label
-        result = []
-        disp = ""
-        while True:
-            frame, img = self.camera.read()
-            img, is_two_hand = self.detector.find_hands(img)
-            if is_two_hand:
-                lm_list1, bbox1 = self.detector.find_position(img, 0)
-                lm_list2, bbox2 = self.detector.find_position(img, 1)
-                if lm_list1.any() and lm_list2.any():
-                    x_1, y_1 = bbox1["bbox"][0], bbox1["bbox"][1]
-                    x_2, y_2 = bbox2["bbox"][0], bbox2["bbox"][1]
-                    lm_list = np.concatenate((lm_list1, lm_list2), axis=0)
-                    data = torch.Tensor(lm_list)
-                    data = data.unsqueeze(0)
-                    data = data.unsqueeze(0)
+        img, is_two_hand = self.detector.find_hands(img)
+        if is_two_hand:
+            lm_list1, bbox1 = self.detector.find_position(img, 0)
+            lm_list2, bbox2 = self.detector.find_position(img, 1)
+            if lm_list1.any() and lm_list2.any():
+                x_1, y_1 = bbox1["bbox"][0], bbox1["bbox"][1]
+                x_2, y_2 = bbox2["bbox"][0], bbox2["bbox"][1]
+                lm_list = np.concatenate((lm_list1, lm_list2), axis=0)
+                data = torch.Tensor(lm_list)
+                data = data.unsqueeze(0)
+                data = data.unsqueeze(0)
 
-                    test_output = cnn(data)
-                    result.append(torch.max(test_output, 1)[1].data.cpu().numpy()[0])
-                    if len(result) > 4:
-                        disp = str(out_label[stats.mode(result)[0][0]])
-                        result = []
+                test_output = cnn(data)
+                self.result.append(torch.max(test_output, 1)[1].data.cpu().numpy()[0])
+                if len(self.result) > 4:
+                    self.disp = str(out_label[stats.mode(self.result)[0][0]])
+                    self.result = []
 
-                    cv2.putText(img, disp, (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-                    cv2.putText(img, disp, (x_2, y_2), cv2.FONT_HERSHEY_PLAIN, 3,
-                                (0, 0, 255), 3)
-
-            cv2.imshow("camera", img)
-            key = cv2.waitKey(1)
-            if cv2.getWindowProperty('camera', cv2.WND_PROP_VISIBLE) < 1:
-                break
-            elif key == 27:
-                break
+                cv2.putText(img, self.disp, (x_1, y_1), cv2.FONT_HERSHEY_PLAIN, 3,
+                            (0, 0, 255), 3)
+                cv2.putText(img, self.disp, (x_2, y_2), cv2.FONT_HERSHEY_PLAIN, 3,
+                            (0, 0, 255), 3)
+        else:
+            return 1
+        return 0
 
 
 if __name__ == '__main__':
diff --git a/gr.py b/gr.py
new file mode 100644
index 0000000..d485359
--- /dev/null
+++ b/gr.py
@@ -0,0 +1,165 @@
+import TM
+import ai
+import ai_two
+import cv2
+import torch
+import torch.nn as nn
+
+
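+# CNN and CNNTwo mirror the class definitions in ai.py and ai_two.py so that
+# torch.load("CNN.pkl") / torch.load("CNN_two.pkl") can resolve the pickled
+# classes when gr.py is run as the entry script (torch.save(model) pickles
+# the class by reference, so the definition must be importable at load time).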
+class CNN(nn.Module):
+    def __init__(self, m):
+        super(CNN, self).__init__()
+        self.out_label = []
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(
+                in_channels=1,
+                out_channels=16,
+                kernel_size=5,
+                stride=1,
+                padding=2,
+            ),
+            nn.ReLU(),
+            nn.MaxPool2d(kernel_size=1),
+        )
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(16, 32, 5, 1, 2),
+            nn.ReLU(),
+            nn.MaxPool2d(2),
+        )
+        self.med = nn.Linear(32 * 11 * 2, 500)
+        self.med2 = nn.Linear(1 * 21 * 3, 100)
+        self.med3 = nn.Linear(100, 500)
+        self.out = nn.Linear(500, m)  # fully connected layer, output m classes
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.conv2(x)
+        x = x.view(x.size(0), -1)  # flatten the conv feature maps to (batch_size, features)
+        x = self.med(x)
+        # x = self.med2(x)
+        # x = self.med3(x)
+        output = self.out(x)
+        return output
+
+
+class CNNTwo(nn.Module):
+    def __init__(self, m):
+        super(CNNTwo, self).__init__()
+        self.out_label = []
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(
+                in_channels=1,
+                out_channels=16,
+                kernel_size=5,
+                stride=1,
+                padding=2,
+            ),
+            nn.ReLU(),
+            nn.MaxPool2d(kernel_size=2),
+        )
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(16, 32, 5, 1, 2),
+            nn.ReLU(),
+            nn.MaxPool2d(2),
+        )
+        self.med = nn.Linear(32 * 11 * 1, 500)
+        self.med2 = nn.Linear(1 * 21 * 3, 100)
+        self.med3 = nn.Linear(100, 500)
+        self.out = nn.Linear(500, m)  # fully connected layer, output m classes
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.conv2(x)
+        x = x.view(x.size(0), -1)  # flatten the conv feature maps to (batch_size, features)
+        x = self.med(x)
+        # x = self.med2(x)
+        # x = self.med3(x)
+        output = self.out(x)
+        return output
+
+
+class Main:
+    def __init__(self):
+        self.camera = cv2.VideoCapture(0, cv2.CAP_DSHOW)
+        self.camera.set(3, 1280)
+        self.camera.set(4, 720)
+
+        self.tm_detector = TM.HandDetector()
+        self.ai_detector = ai.HandDetector()
+        self.at_detector = ai_two.HandDetector()
+
+        self.tm_main = TM.Main()
+        self.ai_main = ai.Main()
+        self.at_main = ai_two.Main()
+
+    def gr_img(self, filedir, diy):
+        print(filedir)
+        img = cv2.imread(filedir)
+        if diy:
+            cnn = torch.load("CNN.pkl")
+            cnn_two = torch.load("CNN_two.pkl")
+        tm_img = self.tm_detector.find_hands(img)
+        while True:
+            # guard the CNN paths with diy, otherwise cnn/cnn_two are unbound
+            if diy:
+                is_one_hand = self.at_main.gesture_recognition(self.at_detector, img, cnn_two)
+                if is_one_hand:
+                    not_match = self.ai_main.gesture_recognition_camera(self.ai_detector, img, cnn)
+                    if not_match:
+                        self.tm_main.gesture_recognition(tm_img, self.tm_detector)
+            else:
+                self.tm_main.gesture_recognition(tm_img, self.tm_detector)
+
+            cv2.imshow("camera", img)
+            key = cv2.waitKey(1)
+            if cv2.getWindowProperty('camera', cv2.WND_PROP_VISIBLE) < 1:
+                break
+            elif key == 27:
+                break
+
+    def gr_video(self, filedir, diy):
+        cap = cv2.VideoCapture(filedir)
+        if diy:
+            cnn = torch.load("CNN.pkl")
+            cnn_two = torch.load("CNN_two.pkl")
+        while True:
+            ret, img = cap.read()
+            if not ret:  # stop at end of file instead of crashing on a None frame
+                break
+            tm_status = self.tm_main.gesture_recognition(self.tm_detector.find_hands(img), self.tm_detector)
+            if tm_status and diy:
+                is_one_hand = self.at_main.gesture_recognition(self.at_detector, img, cnn_two)
+                if is_one_hand:
+                    self.ai_main.gesture_recognition_camera(self.ai_detector, img, cnn)
+
+            cv2.imshow("camera", img)
+            key = cv2.waitKey(1)
+            if cv2.getWindowProperty('camera', cv2.WND_PROP_VISIBLE) < 1:
+                break
+            elif key == 27:
+                break
+        cap.release()
+
+    def gr_realtime(self, diy):
+        if diy:
+            cnn = torch.load("CNN.pkl")
+            cnn_two = torch.load("CNN_two.pkl")
+        while True:
+            ret, img = self.camera.read()
+            tm_status = self.tm_main.gesture_recognition(self.tm_detector.find_hands(img), self.tm_detector)
+            if tm_status and diy:
+                is_one_hand = self.at_main.gesture_recognition(self.at_detector, img, cnn_two)
+                if is_one_hand:
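+                    # Neither the built-in templates nor the two-hand CNN
+                    # matched, so fall back to the single-hand CNN.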
+                    self.ai_main.gesture_recognition_camera(self.ai_detector, img, cnn)
+
+            cv2.imshow("camera", img)
+            key = cv2.waitKey(1)
+            if cv2.getWindowProperty('camera', cv2.WND_PROP_VISIBLE) < 1:
+                break
+            elif key == 27:
+                break
+
+    def ai_input(self):
+        self.ai_main.make_datasets(self.camera, "ai_datasets", 100)
+        self.ai_main.train("ai_datasets")
+        self.at_main.make_datasets(self.camera, "ai_two_datasets", 100)
+        self.at_main.train("ai_two_datasets")
+
+
+if __name__ == '__main__':
+    main = Main()
+    main.gr_img("C:/Users/leafl/Pictures/图片1.png", 0)
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..a75d424
--- /dev/null
+++ b/main.py
@@ -0,0 +1,112 @@
+# -*- coding:utf-8 -*-
+
+"""
+Group project for the Signal Design course
+
+@ by: Leaf
+@ date: 2022-05-28
+"""
+import gr
+
+import tkinter as tk
+from tkinter import filedialog, Button, Label, Frame, ACTIVE, LEFT
+from PIL import Image, ImageTk
+
+
+class DisplayImage:
+    """Shows the image selected by the user."""
+    def __init__(self, master):
+        self.master = master
+        master.title("GUI")
+        self.Text_lab0 = Label(master, text='已加载图像/视频')
+        self.Text_lab0.pack(pady=10)
+        self.image_frame = Frame(master, bd=0, height=300, width=300, bg='white', highlightthickness=2,
+                                 highlightbackground='gray', highlightcolor='black')
+        self.image_frame.pack()
+
+        self.Text_label = Label(master, text='加载待识别影像/视频')
+        self.Text_label.place(x=60, y=410)
+        self.Choose_image = Button(master, command=self.choose_img, text="图像",
+                                   width=7, default=ACTIVE, borderwidth=0)
+        self.Choose_image.place(x=50, y=450)
+        self.Choose_video = Button(master, command=self.choose_video, text="视频",
+                                   width=7, default=ACTIVE, borderwidth=0)
+        self.Choose_video.place(x=120, y=450)
+        self.Text_label2 = Label(master, text='运行手势识别程序')
+        self.Text_label2.place(x=60, y=500)
+        self.image_mosaic = Button(master, command=self.gesture_recognition, text="Gesture recognition",
+                                   width=17, default=ACTIVE, borderwidth=0)
+        self.image_mosaic.place(x=50, y=540)
+        self.Text_label3 = Label(master, text='运行实时手势识别程序')
+        self.Text_label3.place(x=300, y=410)
+        self.realtime = Button(master, command=self.realtime_gr, text="Realtime\n gesture recognition",
+                               width=17, height=6, default=ACTIVE, borderwidth=0)
+        self.realtime.place(x=300, y=450)
+        self.Text_label4 = Label(master, text='录入自定义手势')
+        self.Text_label4.place(x=180, y=610)
+        self.input = Button(master, command=self.input_image, text="Input gesture",
+                            width=42, default=ACTIVE, borderwidth=0)
+        self.input.place(x=60, y=650)
+
+        self.gr = gr.Main()
+        self.temp_dir = "temp"
+        self.mode = 0
+        self.directory = ""
+        self.diy = 1
+
+    def choose_img(self):
+        self.mode = 1
+        # clear the frame's contents
+        for widget in self.image_frame.winfo_children():
+            widget.destroy()
+
+        self.directory = filedialog.askopenfilename()
+
+        # lay out the chosen image
+        img = Image.open(self.directory).resize((300, 300))
+        img.save(self.temp_dir + "/photo.png")
+        image = ImageTk.PhotoImage(image=img)
+        label = Label(self.image_frame, highlightthickness=0, borderwidth=0)
+        label.configure(image=image)
+        label.pack(side=LEFT, expand=True)
+
+    def choose_video(self):
+        self.mode = 2
+        # clear the frame's contents
+        for widget in self.image_frame.winfo_children():
+            widget.destroy()
+
+        self.directory = filedialog.askopenfilename()
+
+        # lay out the video placeholder thumbnail
+        img = Image.open(self.temp_dir + "/video.jpg").resize((300, 300))
+        img.save(self.temp_dir + "/photo.png")
+        image = ImageTk.PhotoImage(image=img)
+        label = Label(self.image_frame, highlightthickness=0, borderwidth=0)
+        label.configure(image=image)
+        label.pack(side=LEFT, expand=True)
+
+    def gesture_recognition(self):
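+        # mode is set by choose_img (1) and choose_video (2); 0 means nothing loaded yet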
+        if self.mode == 1:
+            self.gr.gr_img(self.directory, self.diy)
+        elif self.mode == 2:
+            self.gr.gr_video(self.directory, self.diy)
+
+    def realtime_gr(self):
+        self.gr.gr_realtime(self.diy)
+
+    def input_image(self):
+        self.diy = 1
+        self.gr.ai_input()
+
+
+def main():
+    window = tk.Tk()
+    DisplayImage(window)
+    window.title('手势识别')
+    window.geometry('500x720')
+    window.mainloop()
+
+
+if __name__ == '__main__':
+    main()