#!/usr/bin/env python
'''
You can download the converted .pb model from https://www.dropbox.com/s/qag9vzambhhkvxr/lip_jppnet_384.pb?dl=0
or convert the model yourself.

Follow these steps if you want to convert the original model yourself:
    To get the original .meta pre-trained model, download https://drive.google.com/file/d/1BFVXgeln-bek8TCbRjN6utPAgRE0LJZg/view
    To convert the .meta model to .pb correctly, download the original repository https://github.com/Engineering-Course/LIP_JPPNet
    and change the human parsing script evaluate_parsing_JPPNet-s2.py as follows:
    1. Remove the preprocessing that creates image_batch_origin:
        with tf.name_scope("create_inputs"):
        ...
       and add instead:
        image_batch_origin = tf.placeholder(tf.float32, shape=(2, None, None, 3), name='input')

    2. Create the input batch from an image and its horizontally flipped copy:
        image = cv2.imread('path/to/image')
        image_rev = np.flip(image, axis=1)
        input = np.stack([image, image_rev], axis=0)

    3. Hardcode image_h and image_w shapes to determine the output shapes.
       We use the default INPUT_SIZE = (384, 384) from evaluate_parsing_JPPNet-s2.py.
        parsing_out1 = tf.reduce_mean(tf.stack([tf.image.resize_images(parsing_out1_100, INPUT_SIZE),
                                                tf.image.resize_images(parsing_out1_075, INPUT_SIZE),
                                                tf.image.resize_images(parsing_out1_125, INPUT_SIZE)]), axis=0)
       Do the same for parsing_out2 and parsing_out3.
    4. Remove the postprocessing. The last network operation is:
        raw_output = tf.reduce_mean(tf.stack([parsing_out1, parsing_out2, parsing_out3]), axis=0)
       Change the sess.run(...) call to:
        parsing_ = sess.run(raw_output, feed_dict={'input:0': input})
    5. To save the model, add after sess.run(...):
        input_graph_def = tf.get_default_graph().as_graph_def()
        output_node = "Mean_3"
        output_graph_def = tf.graph_util.convert_variables_to_constants(sess, input_graph_def, [output_node])

        output_graph = "LIP_JPPNet.pb"
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())
'''
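# Example invocation of this sample, once lip_jppnet_384.pb has been downloaded or
# converted as described above (the script and image file names are placeholders):
#
#   python human_parsing.py --input person.jpg --model lip_jppnet_384.pb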

import argparse
import os.path
import numpy as np
import cv2 as cv


backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
            cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD,
           cv.dnn.DNN_TARGET_HDDL, cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)


def preprocess(image):
    """
    Create a 4-dimensional blob from the image and its horizontally flipped copy
    :param image: input image
    """
    image_rev = np.flip(image, axis=1)
    # Build a batch of the original and flipped images and subtract the per-channel mean
    input = cv.dnn.blobFromImages([image, image_rev], mean=(104.00698793, 116.66876762, 122.67891434))
    return input


def run_net(input, model_path, backend, target):
    """
    Read the network and run inference
    :param input: preprocessed input blob
    :param model_path: path to JPPNet model
    :param backend: computation backend
    :param target: computation device
    """
    net = cv.dnn.readNet(model_path)
    net.setPreferableBackend(backend)
    net.setPreferableTarget(target)
    net.setInput(input)
    out = net.forward()
    return out


def postprocess(out, input_shape):
    """
    Create a grayscale human segmentation
    :param out: network output
    :param input_shape: input image width and height
    """
    # LIP classes
    # 0 Background
    # 1 Hat
    # 2 Hair
    # 3 Glove
    # 4 Sunglasses
    # 5 UpperClothes
    # 6 Dress
    # 7 Coat
    # 8 Socks
    # 9 Pants
    # 10 Jumpsuits
    # 11 Scarf
    # 12 Skirt
    # 13 Face
    # 14 LeftArm
    # 15 RightArm
    # 16 LeftLeg
    # 17 RightLeg
    # 18 LeftShoe
    # 19 RightShoe
    # Index 0 of the batch is the prediction for the original image,
    # index 1 is the prediction for its horizontally flipped copy (see preprocess)
    head_output, tail_output = np.split(out, indices_or_sections=[1], axis=0)
    head_output = head_output.squeeze(0)
    tail_output = tail_output.squeeze(0)

    # Resize every class score map to the network input size
    head_output = np.stack([cv.resize(img, dsize=input_shape) for img in head_output[:, ...]])
    tail_output = np.stack([cv.resize(img, dsize=input_shape) for img in tail_output[:, ...]])

    # In the flipped prediction left and right parts are exchanged:
    # swap the arm, leg and shoe channels and flip the score maps back horizontally
    tail_list = np.split(tail_output, indices_or_sections=list(range(1, 20)), axis=0)
    tail_list = [arr.squeeze(0) for arr in tail_list]
    tail_list_rev = [tail_list[i] for i in range(14)]
    tail_list_rev.extend([tail_list[15], tail_list[14], tail_list[17], tail_list[16], tail_list[19], tail_list[18]])
    tail_output_rev = np.stack(tail_list_rev, axis=0)
    tail_output_rev = np.flip(tail_output_rev, axis=2)

    # Average the two predictions and take the per-pixel argmax over the 20 classes
    raw_output_all = np.mean(np.stack([head_output, tail_output_rev], axis=0), axis=0, keepdims=True)
    raw_output_all = np.argmax(raw_output_all, axis=1)
    raw_output_all = raw_output_all.transpose(1, 2, 0)
    return raw_output_all


def decode_labels(gray_image):
    """
    Colorize image according to labels
    :param gray_image: grayscale human segmentation result
    """
    height, width, _ = gray_image.shape
    colors = [(0, 0, 0), (128, 0, 0), (255, 0, 0), (0, 85, 0), (170, 0, 51), (255, 85, 0),
              (0, 0, 85), (0, 119, 221), (85, 85, 0), (0, 85, 85), (85, 51, 0), (52, 86, 128),
              (0, 128, 0), (0, 0, 255), (51, 170, 221), (0, 255, 255), (85, 255, 170),
              (170, 255, 85), (255, 255, 0), (255, 170, 0)]

    segm = np.stack([colors[idx] for idx in gray_image.flatten()])
    segm = segm.reshape(height, width, 3).astype(np.uint8)
    segm = cv.cvtColor(segm, cv.COLOR_BGR2RGB)
    return segm


def parse_human(image, model_path, backend=cv.dnn.DNN_BACKEND_OPENCV, target=cv.dnn.DNN_TARGET_CPU):
    """
    Prepare the input, run the network and postprocess its output to parse a human.
    :param image: input image
    :param model_path: path to JPPNet model
    :param backend: computation backend
    :param target: computation target
    """
    input = preprocess(image)
    input_h, input_w = input.shape[2:]
    output = run_net(input, model_path, backend, target)
    grayscale_out = postprocess(output, (input_w, input_h))
    segmentation = decode_labels(grayscale_out)
    return segmentation


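# parse_human can also be called from another script, e.g. (a sketch; the module
# and file names are placeholders and depend on how this sample is saved):
#   import cv2 as cv
#   from human_parsing import parse_human
#   segm = parse_human(cv.imread('person.jpg'), 'lip_jppnet_384.pb')
#   cv.imwrite('person_parsed.png', segm)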
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Use this script to run human parsing using JPPNet',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--input', '-i', required=True, help='Path to input image.')
    parser.add_argument('--model', '-m', default='lip_jppnet_384.pb', help='Path to pb model.')
    parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
                        help="Choose one of computation backends: "
                             "%d: automatically (by default), "
                             "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
                             "%d: OpenCV implementation, "
                             "%d: VKCOM, "
                             "%d: CUDA" % backends)
    parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
                        help='Choose one of target computation devices: '
                             '%d: CPU target (by default), '
                             '%d: OpenCL, '
                             '%d: OpenCL fp16 (half-float precision), '
                             '%d: NCS2 VPU, '
                             '%d: HDDL VPU, '
                             '%d: Vulkan, '
                             '%d: CUDA, '
                             '%d: CUDA fp16 (half-float precision)' % targets)
    args, _ = parser.parse_known_args()

    if not os.path.isfile(args.model):
        raise OSError("Model does not exist: " + args.model)

    image = cv.imread(args.input)
    output = parse_human(image, args.model, args.backend, args.target)
    winName = 'Deep learning human parsing in OpenCV'
    cv.namedWindow(winName, cv.WINDOW_AUTOSIZE)
    cv.imshow(winName, output)
    cv.waitKey()