## The code

Below is a brief description of the scripts in this project.

### `__main__.py`

This is the entry point that will be run. It calls the other files and runs the program. It re-uses some components of the dynamic-rim module to read the video pts (presentation timestamps) and ts (world timestamps), and to get the correct video frame for a specific timestamp. Here you can also see how the audio and video timestamps are merged into a single pandas DataFrame, which is then cropped using the start and end event timestamps.
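
The merge itself happens in `__main__`; below is a minimal sketch of how two timestamp streams can be combined and cropped with pandas. All column names and values are made up for illustration, not the module's actual data:

```python
import pandas as pd

# Toy timestamp streams (nanoseconds); the real module reads these from the recording
video = pd.DataFrame({"ts": [0, 33_366_700, 66_733_400], "pts": [0, 1001, 2002]})
audio = pd.DataFrame({"ts": [0, 21_333_333, 42_666_666], "audio_pts": [0, 1024, 2048]})

# Pair every video frame with the nearest audio entry by timestamp
merged = pd.merge_asof(video, audio, on="ts", direction="nearest")

# Crop to the section between the start and end events (made-up values here)
start_ts, end_ts = 0, 66_733_400
merged = merged[(merged["ts"] >= start_ts) & (merged["ts"] <= end_ts)]
merged = merged.reset_index(drop=True)
```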

### `pose.py`

This file contains the main functions needed to run DensePose. A `setup_config` function loads the config file for the model, as well as the weights. It also defines the visualizer, the extractor and, most importantly, the predictor.

These are passed back to `__main__`:

```python
    args.confidence, args.device
)
merged_video = merged_video.reset_index(drop=True, inplace=False)
```
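
The body of `setup_config` is not reproduced here; below is a minimal sketch of what such a function can look like with detectron2's DensePose project. The function name, file paths and defaults are illustrative, not the project's actual code:

```python
import torch
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor
from densepose import add_densepose_config


def setup_config_sketch(config_fpath, weights_fpath, confidence=0.7, device="cuda"):
    """Sketch: load a DensePose config and weights, then build the predictor."""
    cfg = get_cfg()
    add_densepose_config(cfg)  # register the DensePose-specific config keys
    cfg.merge_from_file(config_fpath)  # e.g. a densepose_rcnn_R_50_FPN YAML
    cfg.MODEL.WEIGHTS = weights_fpath
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = confidence  # detection threshold
    cfg.MODEL.DEVICE = device if torch.cuda.is_available() else "cpu"
    cfg.freeze()
    return cfg, DefaultPredictor(cfg)
```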

Finally, `get_densepose` is the main call that runs DensePose on the video. It runs the predictor on the frame, which produces the outputs.

The results are the DensePose chart predictions (`pred_densepose`) and the prediction boxes (`pred_boxes_XYXY`).

```python
def get_densepose(
    frame,
    predictor,
    visualizer,
    extractor,
    cfg,
    xy,
    starter=None,
    ender=None,
    timings=0,
    circle_size=50,
    frameid=0,
    progress_bar=None,
    poses_task=None,
    labels_onimg=True,
):
    with torch.no_grad():
        # Let the GPU warm up and measure inference time after 60 frames
        if starter is not None and 60 < frameid < (len(timings) + 60):
            starter.record()
            outputs = predictor(frame)["instances"]
            ender.record()
            torch.cuda.synchronize()
            timings[frameid - 60] = starter.elapsed_time(ender)
        else:
            outputs = predictor(frame)["instances"]
    result = {}
    extractor_r = extractor
    if outputs.has("scores"):
        result["scores"] = outputs.get("scores").cpu()
    if outputs.has("pred_boxes"):
        result["pred_boxes_XYXY"] = outputs.get("pred_boxes").tensor.cpu()
        if outputs.has("pred_densepose"):
            # Pick the extractor that matches the predictor output type
            if isinstance(outputs.pred_densepose, DensePoseChartPredictorOutput):
                extractor_r = DensePoseResultExtractor()
            elif isinstance(outputs.pred_densepose, DensePoseEmbeddingPredictorOutput):
                extractor_r = DensePoseOutputsExtractor()
            result["pred_densepose"] = extractor_r(outputs)[0]
    logging.debug(f"DensePose result: {result}")
    # Convert the frame to a normalized grayscale 3-channel image for visualization
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    frame = np.tile(frame[:, :, np.newaxis], (1, 1, 3)) / 255
    data = extractor(outputs)
    id_part = []
    # As of now, it checks the gaze point against the labels from DensePose.
    if xy is not None and not np.isnan(xy).any() and len(result["pred_boxes_XYXY"]) > 0:
        if progress_bar is not None and poses_task is not None:
            progress_bar.reset(
                poses_task,
                total=len(result["pred_boxes_XYXY"]),
                description=f"🤸‍♀️ Estimating poses at frame:{frameid}",
            )
        pointsCircle = getpointsCircle(xy, circle_size)
        for point in pointsCircle:
            for i, box in enumerate(result["pred_boxes_XYXY"]):
                if (
                    point[0] > box[0]
                    and point[0] < box[2]
                    and point[1] > box[1]
                    and point[1] < box[3]
                ):
                    # Labels of the bounding box the point falls in
                    labels_bb = result["pred_densepose"][i].labels.cpu().numpy()
                    # Gaze point relative to the bounding box
                    x = int(np.floor(point[0] - box[0]))
                    y = int(np.floor(point[1] - box[1]))
                    x = x - 1 if x != 0 else x
                    y = y - 1 if y != 0 else y
                    id_part.append(labels_bb[y, x])
                else:
                    id_part.append(0)
                if progress_bar is not None and poses_task is not None:
                    progress_bar.advance(poses_task)
    else:
        id_part.append(0)
    # Get the name of each body part gazed at, using the unique ids
    id_part = list(set(id_part))
    id_name = []
    for i in range(len(id_part)):
        if id_part[i] != 0:  # skip the background label
            id_name.append(PartsDefinition(id_part[i]).name)
    text_id_name = ", ".join(id_name)
    logging.debug(f"DensePose frame {frameid} - looking at part {text_id_name}")

    # Draw segmentation
    frame = (frame * 255).astype(np.uint8)
    if xy is not None and not np.isnan(xy).any() and len(result["pred_boxes_XYXY"]) > 0:
        frame_vis = pl_dp_vis.vis_pose(frame, result, id_part, xy)
    else:
        frame_vis = frame

    # Write the body part in the bottom left corner of the image
    if labels_onimg:
        cv2.putText(
            frame_vis,
            text_id_name,
            (10, 1000),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (255, 255, 255),
            lineType=1,
        )
    return frame_vis, result, text_id_name, starter, ender, timings, poses_task
```

This is called in `__main__` here:

```python
(
    frame,
    _,
    id_name,
    starter,
    ender,
    timings,
    poses_task,
) = pose.get_densepose(
    frame,
    predictor,
    visualizer,
    extractor,
    cfg,
    xy,
    starter,
    ender,
    timings,
    args.circle_size,
    frameid=num_processed_frames,
    progress_bar=progress_bar,
    poses_task=poses_task,
)  # frame must be BGR
```

The predictor, visualizer and config are passed in along with the frame, `circle_size` and the gaze coordinates.

#### Inference

On L138 you can find the call to the predictor, which is where inference is run.
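
As the warm-up logic in `get_densepose` shows, inference latency is measured with `torch.cuda.Event` timers after letting the GPU warm up for 60 frames. Below is a self-contained sketch of that pattern; the model and inputs are placeholders, not the project's:

```python
import numpy as np
import torch

# Placeholder model and inputs, only to make the sketch runnable on a CUDA machine
model = torch.nn.Conv2d(3, 8, kernel_size=3).cuda()
frames = [torch.randn(1, 3, 224, 224, device="cuda") for _ in range(400)]

starter = torch.cuda.Event(enable_timing=True)
ender = torch.cuda.Event(enable_timing=True)
timings = np.zeros(300)  # measure up to 300 frames after the warm-up

with torch.no_grad():
    for frameid, frame in enumerate(frames):
        if 60 < frameid < len(timings) + 60:  # skip the first 60 frames (GPU warm-up)
            starter.record()
            _ = model(frame)
            ender.record()
            torch.cuda.synchronize()  # wait until the GPU has finished this frame
            timings[frameid - 60] = starter.elapsed_time(ender)  # milliseconds
        else:
            _ = model(frame)

print(f"Mean inference time: {timings[timings > 0].mean():.2f} ms")
```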

### `vis.py`

#### `vis_pose`

A function to visualize the DensePose parts on the video frame.

```python
def vis_pose(frame, result, id_part, xy, bbox=True, scores=True, parts=True):
    """Visualize DensePose data on a frame."""
    for i, box in enumerate(result["pred_boxes_XYXY"]):
        box = np.floor(box.cpu().numpy()).astype(np.int32)
        roi = frame[box[1] : box[3], box[0] : box[2]]
        if bbox:
            cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), (0, 180, 0), 2)
        if scores:
            # Put the score on the frame
            cv2.putText(
                frame,
                f"{result['scores'][i]:.2f}",
                (box[0], box[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.9,
                (255, 255, 255),
                2,
            )
        if parts:
            labels_bb = result["pred_densepose"][i].labels.cpu().numpy()
            # Resize to the bounding box
            labels_bb = cv2.resize(
                labels_bb,
                (box[2] - box[0], box[3] - box[1]),
                interpolation=cv2.INTER_NEAREST,
            )
            # get the background mask (remains as the frame)
            mask_inv = cv2.bitwise_not(labels_bb.astype(np.uint8))
            bkg = cv2.bitwise_and(roi, roi, mask=mask_inv)

            # get the mask of the body part and apply a color map to the parts
            fg = labels_bb * 8
            fg = fg.astype(np.uint8)
            fg = cv2.applyColorMap(fg, cv2.COLORMAP_OCEAN)
            fg = cv2.bitwise_and(fg, fg, mask=labels_bb.astype(np.uint8))

            # plot the gazed part in a different color
            if (
                id_part is not None
                and xy[0] < box[2]
                and xy[0] > box[0]
                and xy[1] < box[3]
                and xy[1] > box[1]
            ):
                if len(id_part) == 1 and id_part[0] == 0:
                    continue
                # remove 0 (background) from id_part
                id_part = id_part[1:] if id_part[0] == 0 else id_part
                gazed = labels_bb
                gazed[np.isin(labels_bb, id_part, invert=True)] = 0
                gazed[gazed > 0] = 255
                gazed_mask = gazed.astype(np.uint8)
                g = np.stack(
                    [np.zeros_like(gazed_mask), gazed_mask, gazed_mask],
                    axis=2,
                )
                g = cv2.bitwise_and(g, g, mask=gazed_mask.astype(np.uint8))
                inv_mask = cv2.bitwise_not(gazed_mask.astype(np.uint8))
                fg = cv2.bitwise_and(fg, fg, mask=inv_mask)
                # add the gazed part to the foreground
                fg = cv2.add(fg, g)

            # merge the foreground and background
            blended = cv2.add(bkg, fg)

            # Add transparency
            frame[box[1] : box[3], box[0] : box[2]] = blended
            # cv2.addWeighted(roi, 0.3, blended, 0.7, 0)
    return frame
```
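
The overlay relies on a standard OpenCV compositing pattern: mask the region of interest into background and foreground with `cv2.bitwise_and`, colour-map the foreground, and recombine with `cv2.add`. Here is a self-contained sketch of that pattern on synthetic data (this sketch binarizes the mask explicitly, the usual form of the trick):

```python
import cv2
import numpy as np

# Synthetic stand-ins: a grey "person crop" and a fake body-part label map
roi = np.full((100, 100, 3), 120, dtype=np.uint8)
labels = np.zeros((100, 100), dtype=np.uint8)
labels[30:70, 30:70] = 5  # one body part with label id 5

mask = (labels > 0).astype(np.uint8) * 255                    # binary part mask
bkg = cv2.bitwise_and(roi, roi, mask=cv2.bitwise_not(mask))   # frame outside the part
fg = cv2.applyColorMap(labels * 8, cv2.COLORMAP_OCEAN)        # colour-code the labels
fg = cv2.bitwise_and(fg, fg, mask=mask)                       # colour only on the part
blended = cv2.add(bkg, fg)                                    # composite the two layers
```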

#### `report`

Generates a plot and a CSV file with the count of gazed body parts.

```python
def report(pandas_df, out_dir):
    """This function takes the final pandas DataFrame and returns a report
    with the number of frames in which each body part was gazed at.
    """
    parts = pandas_df["densepose"]
    parts = parts.str.replace("BACKGROUND", "")
    parts = parts.str.replace(",", "", 1)
    parts = parts.str.split(",")
    parts = parts.apply(lambda x: [i for i in x if i])
    parts = parts.apply(lambda x: [i.strip() for i in x])
    parts = [item for sublist in parts for item in sublist]
    while any(" " in s for s in parts):
        parts = [i.split(" ") for i in parts]
        parts = [item for sublist in parts for item in sublist]
    # drop empty strings left over from the splitting
    parts = [s for s in parts if s]

    # Count the number of times each part is gazed at
    parts_count = {i: parts.count(i) for i in parts}
    parts_count = dict(
        sorted(parts_count.items(), key=lambda item: item[1], reverse=True)
    )
    # Make the parts count into a pandas DataFrame
    parts_count = pd.DataFrame.from_dict(parts_count, orient="index")
    parts_count.columns = ["count"]
    parts_count.index.name = "part"
    parts_count = parts_count.reset_index()

    # Save it as a csv
    parts_count.to_csv(os.path.join(out_dir, "parts_count.csv"), index=False)

    # Load the graphs from the assets folder
    base_body = cv2.imread(
        os.path.join(os.path.dirname(__file__), "assets/body_shape.png")
    )
    col_body = cv2.imread(
        os.path.join(os.path.dirname(__file__), "assets/body_shape_coloured.png")
    )
    part_pixels = dict()
    for part in PartsColour:
        part_pixels[part.name] = np.where(np.all(col_body == part.value, axis=-1))

    # Shade each part in proportion to its count
    step = 255 / parts_count["count"].max()
    for i, row in parts_count.iterrows():
        part = row["part"]
        count = row["count"]
        base_body[part_pixels[part]] = (
            255 - (count * step),
            255 - (count * step),
            255 - (count * step),
        )

    logos = base_body[:200, :, :]
    base_body = base_body[200:, :, :]
    base_body = cv2.applyColorMap(base_body, cv2.COLORMAP_HOT)

    # Add the logos
    gazemap = np.concatenate((logos, base_body), axis=0)

    # Add a colorbar
    margin = np.full((gazemap.shape[0], 100, 3), 255, dtype=np.uint8)
    colorbar = np.zeros((255, 50, 3), dtype=np.uint8)
    for i in range(255):
        colorbar[254 - i, :, :] = (255 - i, 255 - i, 255 - i)
    colorbar = cv2.applyColorMap(colorbar, cv2.COLORMAP_HOT)

    colorbar = cv2.resize(colorbar, (20, gazemap.shape[0]))

    # add tick values next to the colorbar
    font = cv2.FONT_HERSHEY_SIMPLEX
    fontScale = 0.5
    fontColor = (0, 0, 0)
    thickness = 2

    step = gazemap.shape[0] / parts_count["count"].max()
    for i in range(0, parts_count["count"].max(), 25):
        cv2.putText(
            margin,
            "{}".format(parts_count["count"].max() - i),
            (50, int(np.round(i * step))),
            font,
            fontScale,
            fontColor,
            thickness,
        )
    gazemap = np.concatenate((gazemap, margin), axis=1)
    gazemap = np.concatenate((gazemap, colorbar), axis=1)

    # save the gazemap (cv2.imwrite expects BGR input)
    cv2.imwrite(os.path.join(out_dir, "gazemap.png"), gazemap)
```
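
A hypothetical usage sketch: `report` expects the final DataFrame to carry a `densepose` column of comma-separated part names, as produced by `get_densepose`'s `text_id_name`. The part names and output directory below are made up for illustration:

```python
import pandas as pd

# Made-up gaze labels; the real names come from the module's PartsDefinition enum
df = pd.DataFrame(
    {"densepose": ["BACKGROUND, TORSO", "TORSO, HEAD", "BACKGROUND"]}
)
report(df, "./output")  # writes parts_count.csv and gazemap.png to ./output
```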