## The code

Below is a brief description of the scripts in this project.

### `__main__.py`

This is the entry point that will be run. It calls the other files and runs the program. It re-uses some components of the dynamic-rim module to read the video pts (presentation timestamps) and ts (world timestamps), and to get the correct video frame for a specific timestamp. Here you can also see how the audio and video timestamps are merged into a single pandas DataFrame, which is then cropped using the start and end event timestamps.
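
The merge itself happens in `__main__`; below is a minimal sketch of how two timestamp streams can be combined and cropped with pandas. All column names and values are made up for illustration, not the module's actual data:

```python
import pandas as pd

# Toy timestamp streams (nanoseconds); the real module reads these from the recording
video = pd.DataFrame({"ts": [0, 33_366_700, 66_733_400], "pts": [0, 1001, 2002]})
audio = pd.DataFrame({"ts": [0, 21_333_333, 42_666_666], "audio_pts": [0, 1024, 2048]})

# Pair every video frame with the nearest audio entry by timestamp
merged = pd.merge_asof(video, audio, on="ts", direction="nearest")

# Crop to the section between the start and end events (made-up values here)
start_ts, end_ts = 0, 66_733_400
merged = merged[(merged["ts"] >= start_ts) & (merged["ts"] <= end_ts)]
merged = merged.reset_index(drop=True)
```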

### `pose.py`

This file contains the main functions needed to run DensePose. A `setup_config` function loads the config file for the model, as well as the weights. It also defines the visualizer, the extractor and, most importantly, the predictor.

These are passed back to `__main__`:

```python
    args.confidence, args.device
)
merged_video = merged_video.reset_index(drop=True, inplace=False)
```
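
The body of `setup_config` is not reproduced here; below is a minimal sketch of what such a function can look like with detectron2's DensePose project. The function name, file paths and defaults are illustrative, not the project's actual code:

```python
import torch
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor
from densepose import add_densepose_config


def setup_config_sketch(config_fpath, weights_fpath, confidence=0.7, device="cuda"):
    """Sketch: load a DensePose config and weights, then build the predictor."""
    cfg = get_cfg()
    add_densepose_config(cfg)  # register the DensePose-specific config keys
    cfg.merge_from_file(config_fpath)  # e.g. a densepose_rcnn_R_50_FPN YAML
    cfg.MODEL.WEIGHTS = weights_fpath
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = confidence  # detection threshold
    cfg.MODEL.DEVICE = device if torch.cuda.is_available() else "cpu"
    cfg.freeze()
    return cfg, DefaultPredictor(cfg)
```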

Finally, `get_densepose` is the main call that runs DensePose on the video. It runs the predictor on the frame, which produces the outputs.

The results are the DensePose chart predictions (`pred_densepose`) and the prediction boxes (`pred_boxes_XYXY`).

```python
def get_densepose(
    frame,
    predictor,
    visualizer,
    extractor,
    cfg,
    xy,
    starter=None,
    ender=None,
    timings=0,
    circle_size=50,
    frameid=0,
    progress_bar=None,
    poses_task=None,
    labels_onimg=True,
):
    with torch.no_grad():
        # Let the GPU warm up and measure inference time after 60 frames
        if starter is not None and 60 < frameid < (len(timings) + 60):
            starter.record()
            outputs = predictor(frame)["instances"]
            ender.record()
            torch.cuda.synchronize()
            timings[frameid - 60] = starter.elapsed_time(ender)
        else:
            outputs = predictor(frame)["instances"]
    result = {}
    extractor_r = extractor
    if outputs.has("scores"):
        result["scores"] = outputs.get("scores").cpu()
    if outputs.has("pred_boxes"):
        result["pred_boxes_XYXY"] = outputs.get("pred_boxes").tensor.cpu()
        if outputs.has("pred_densepose"):
            # Pick the extractor that matches the predictor output type
            if isinstance(outputs.pred_densepose, DensePoseChartPredictorOutput):
                extractor_r = DensePoseResultExtractor()
            elif isinstance(outputs.pred_densepose, DensePoseEmbeddingPredictorOutput):
                extractor_r = DensePoseOutputsExtractor()
            result["pred_densepose"] = extractor_r(outputs)[0]
    logging.debug(f"DensePose result: {result}")
    # Convert the frame to a normalized grayscale 3-channel image for visualization
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    frame = np.tile(frame[:, :, np.newaxis], (1, 1, 3)) / 255
    data = extractor(outputs)
    id_part = []
    # As of now, it checks the gaze point against the labels from DensePose.
    if xy is not None and not np.isnan(xy).any() and len(result["pred_boxes_XYXY"]) > 0:
        if progress_bar is not None and poses_task is not None:
            progress_bar.reset(
                poses_task,
                total=len(result["pred_boxes_XYXY"]),
                description=f"🤸‍♀️ Estimating poses at frame:{frameid}",
            )
        pointsCircle = getpointsCircle(xy, circle_size)
        for point in pointsCircle:
            for i, box in enumerate(result["pred_boxes_XYXY"]):
                if (
                    point[0] > box[0]
                    and point[0] < box[2]
                    and point[1] > box[1]
                    and point[1] < box[3]
                ):
                    # Labels of the bounding box the point falls in
                    labels_bb = result["pred_densepose"][i].labels.cpu().numpy()
                    # Gaze point relative to the bounding box
                    x = int(np.floor(point[0] - box[0]))
                    y = int(np.floor(point[1] - box[1]))
                    x = x - 1 if x != 0 else x
                    y = y - 1 if y != 0 else y
                    id_part.append(labels_bb[y, x])
                else:
                    id_part.append(0)
                if progress_bar is not None and poses_task is not None:
                    progress_bar.advance(poses_task)
    else:
        id_part.append(0)
    # Get the name of each body part gazed at, using the unique ids
    id_part = list(set(id_part))
    id_name = []
    for i in range(len(id_part)):
        if id_part[i] != 0:  # skip the background label
            id_name.append(PartsDefinition(id_part[i]).name)
    text_id_name = ", ".join(id_name)
    logging.debug(f"DensePose frame {frameid} - looking at part {text_id_name}")

    # Draw segmentation
    frame = (frame * 255).astype(np.uint8)
    if xy is not None and not np.isnan(xy).any() and len(result["pred_boxes_XYXY"]) > 0:
        frame_vis = pl_dp_vis.vis_pose(frame, result, id_part, xy)
    else:
        frame_vis = frame

    # Write the body part in the bottom left corner of the image
    if labels_onimg:
        cv2.putText(
            frame_vis,
            text_id_name,
            (10, 1000),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (255, 255, 255),
            lineType=1,
        )
    return frame_vis, result, text_id_name, starter, ender, timings, poses_task
```

This is called in `__main__` here:

```python
(
    frame,
    _,
    id_name,
    starter,
    ender,
    timings,
    poses_task,
) = pose.get_densepose(
    frame,
    predictor,
    visualizer,
    extractor,
    cfg,
    xy,
    starter,
    ender,
    timings,
    args.circle_size,
    frameid=num_processed_frames,
    progress_bar=progress_bar,
    poses_task=poses_task,
)  # frame must be BGR
```

The predictor, visualizer and config are passed in along with the frame, `circle_size` and the gaze coordinates.

#### Inference

On L138 you can find the call to the predictor, which is where inference is run.
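
As the warm-up logic in `get_densepose` shows, inference latency is measured with `torch.cuda.Event` timers after letting the GPU warm up for 60 frames. Below is a self-contained sketch of that pattern; the model and inputs are placeholders, not the project's:

```python
import numpy as np
import torch

# Placeholder model and inputs, only to make the sketch runnable on a CUDA machine
model = torch.nn.Conv2d(3, 8, kernel_size=3).cuda()
frames = [torch.randn(1, 3, 224, 224, device="cuda") for _ in range(400)]

starter = torch.cuda.Event(enable_timing=True)
ender = torch.cuda.Event(enable_timing=True)
timings = np.zeros(300)  # measure up to 300 frames after the warm-up

with torch.no_grad():
    for frameid, frame in enumerate(frames):
        if 60 < frameid < len(timings) + 60:  # skip the first 60 frames (GPU warm-up)
            starter.record()
            _ = model(frame)
            ender.record()
            torch.cuda.synchronize()  # wait until the GPU has finished this frame
            timings[frameid - 60] = starter.elapsed_time(ender)  # milliseconds
        else:
            _ = model(frame)

print(f"Mean inference time: {timings[timings > 0].mean():.2f} ms")
```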

### `vis.py`

#### `vis_pose`

A function to visualize the DensePose parts on the video frame.

```python
def vis_pose(frame, result, id_part, xy, bbox=True, scores=True, parts=True):
    """Visualize DensePose data on a frame."""
    for i, box in enumerate(result["pred_boxes_XYXY"]):
        box = np.floor(box.cpu().numpy()).astype(np.int32)
        roi = frame[box[1] : box[3], box[0] : box[2]]
        if bbox:
            cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), (0, 180, 0), 2)
        if scores:
            # Put the score on the frame
            cv2.putText(
                frame,
                f"{result['scores'][i]:.2f}",
                (box[0], box[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.9,
                (255, 255, 255),
                2,
            )
        if parts:
            labels_bb = result["pred_densepose"][i].labels.cpu().numpy()
            # Resize to the bounding box
            labels_bb = cv2.resize(
                labels_bb,
                (box[2] - box[0], box[3] - box[1]),
                interpolation=cv2.INTER_NEAREST,
            )
            # get the background mask (remains as the frame)
            mask_inv = cv2.bitwise_not(labels_bb.astype(np.uint8))
            bkg = cv2.bitwise_and(roi, roi, mask=mask_inv)

            # get the mask of the body part and apply a color map to the parts
            fg = labels_bb * 8
            fg = fg.astype(np.uint8)
            fg = cv2.applyColorMap(fg, cv2.COLORMAP_OCEAN)
            fg = cv2.bitwise_and(fg, fg, mask=labels_bb.astype(np.uint8))

            # plot the gazed part in a different color
            if (
                id_part is not None
                and xy[0] < box[2]
                and xy[0] > box[0]
                and xy[1] < box[3]
                and xy[1] > box[1]
            ):
                if len(id_part) == 1 and id_part[0] == 0:
                    continue
                # remove 0 (background) from id_part
                id_part = id_part[1:] if id_part[0] == 0 else id_part
                gazed = labels_bb
                gazed[np.isin(labels_bb, id_part, invert=True)] = 0
                gazed[gazed > 0] = 255
                gazed_mask = gazed.astype(np.uint8)
                g = np.stack(
                    [np.zeros_like(gazed_mask), gazed_mask, gazed_mask],
                    axis=2,
                )
                g = cv2.bitwise_and(g, g, mask=gazed_mask.astype(np.uint8))
                inv_mask = cv2.bitwise_not(gazed_mask.astype(np.uint8))
                fg = cv2.bitwise_and(fg, fg, mask=inv_mask)
                # add the gazed part to the foreground
                fg = cv2.add(fg, g)

            # merge the foreground and background
            blended = cv2.add(bkg, fg)

            # Add transparency
            frame[box[1] : box[3], box[0] : box[2]] = blended
            # cv2.addWeighted(roi, 0.3, blended, 0.7, 0)
    return frame
```
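
The overlay relies on a standard OpenCV compositing pattern: mask the region of interest into background and foreground with `cv2.bitwise_and`, colour-map the foreground, and recombine with `cv2.add`. Here is a self-contained sketch of that pattern on synthetic data (this sketch binarizes the mask explicitly, the usual form of the trick):

```python
import cv2
import numpy as np

# Synthetic stand-ins: a grey "person crop" and a fake body-part label map
roi = np.full((100, 100, 3), 120, dtype=np.uint8)
labels = np.zeros((100, 100), dtype=np.uint8)
labels[30:70, 30:70] = 5  # one body part with label id 5

mask = (labels > 0).astype(np.uint8) * 255                    # binary part mask
bkg = cv2.bitwise_and(roi, roi, mask=cv2.bitwise_not(mask))   # frame outside the part
fg = cv2.applyColorMap(labels * 8, cv2.COLORMAP_OCEAN)        # colour-code the labels
fg = cv2.bitwise_and(fg, fg, mask=mask)                       # colour only on the part
blended = cv2.add(bkg, fg)                                    # composite the two layers
```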

#### `report`

Generates a plot and a CSV file with the count of gazed body parts.

```python
def report(pandas_df, out_dir):
    """This function takes the final pandas DataFrame and returns a report
    with the number of frames in which each body part was gazed at.
    """
    parts = pandas_df["densepose"]
    parts = parts.str.replace("BACKGROUND", "")
    parts = parts.str.replace(",", "", 1)
    parts = parts.str.split(",")
    parts = parts.apply(lambda x: [i for i in x if i])
    parts = parts.apply(lambda x: [i.strip() for i in x])
    parts = [item for sublist in parts for item in sublist]
    while any(" " in s for s in parts):
        parts = [i.split(" ") for i in parts]
        parts = [item for sublist in parts for item in sublist]
    # drop empty strings left over from the splitting
    parts = [s for s in parts if s]

    # Count the number of times each part is gazed at
    parts_count = {i: parts.count(i) for i in parts}
    parts_count = dict(
        sorted(parts_count.items(), key=lambda item: item[1], reverse=True)
    )
    # Make the parts count into a pandas DataFrame
    parts_count = pd.DataFrame.from_dict(parts_count, orient="index")
    parts_count.columns = ["count"]
    parts_count.index.name = "part"
    parts_count = parts_count.reset_index()

    # Save it as a csv
    parts_count.to_csv(os.path.join(out_dir, "parts_count.csv"), index=False)

    # Load the graphs from the assets folder
    base_body = cv2.imread(
        os.path.join(os.path.dirname(__file__), "assets/body_shape.png")
    )
    col_body = cv2.imread(
        os.path.join(os.path.dirname(__file__), "assets/body_shape_coloured.png")
    )
    part_pixels = dict()
    for part in PartsColour:
        part_pixels[part.name] = np.where(np.all(col_body == part.value, axis=-1))

    # Shade each part in proportion to its count
    step = 255 / parts_count["count"].max()
    for i, row in parts_count.iterrows():
        part = row["part"]
        count = row["count"]
        base_body[part_pixels[part]] = (
            255 - (count * step),
            255 - (count * step),
            255 - (count * step),
        )

    logos = base_body[:200, :, :]
    base_body = base_body[200:, :, :]
    base_body = cv2.applyColorMap(base_body, cv2.COLORMAP_HOT)

    # Add the logos
    gazemap = np.concatenate((logos, base_body), axis=0)

    # Add a colorbar
    margin = np.full((gazemap.shape[0], 100, 3), 255, dtype=np.uint8)
    colorbar = np.zeros((255, 50, 3), dtype=np.uint8)
    for i in range(255):
        colorbar[254 - i, :, :] = (255 - i, 255 - i, 255 - i)
    colorbar = cv2.applyColorMap(colorbar, cv2.COLORMAP_HOT)

    colorbar = cv2.resize(colorbar, (20, gazemap.shape[0]))

    # add tick values next to the colorbar
    font = cv2.FONT_HERSHEY_SIMPLEX
    fontScale = 0.5
    fontColor = (0, 0, 0)
    thickness = 2

    step = gazemap.shape[0] / parts_count["count"].max()
    for i in range(0, parts_count["count"].max(), 25):
        cv2.putText(
            margin,
            "{}".format(parts_count["count"].max() - i),
            (50, int(np.round(i * step))),
            font,
            fontScale,
            fontColor,
            thickness,
        )
    gazemap = np.concatenate((gazemap, margin), axis=1)
    gazemap = np.concatenate((gazemap, colorbar), axis=1)

    # save the gazemap (cv2.imwrite expects BGR input)
    cv2.imwrite(os.path.join(out_dir, "gazemap.png"), gazemap)
```
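
A hypothetical usage sketch: `report` expects the final DataFrame to carry a `densepose` column of comma-separated part names, as produced by `get_densepose`'s `text_id_name`. The part names and output directory below are made up for illustration:

```python
import pandas as pd

# Made-up gaze labels; the real names come from the module's PartsDefinition enum
df = pd.DataFrame(
    {"densepose": ["BACKGROUND, TORSO", "TORSO, HEAD", "BACKGROUND"]}
)
report(df, "./output")  # writes parts_count.csv and gazemap.png to ./output
```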