cv.solvePnP does not respond to the input data when estimating head pose?
from __future__ import print_function
import os
import cv2
import sys
import numpy as np
import math
import json
# 3D model points.
MODEL_POINTS_68 = np.array([
[-53.30021857, -15.27635071, 60.43970857],
[-52.85842643, -1.810875, 59.18390214],
[-51.25744357, 11.67454214, 58.42188571],
[-48.626315, 24.63487214, 57.20655286],
[-43.58355, 36.41352214, 53.98811286],
[-35.42554, 46.354015, 48.35620143],
[-25.24848857, 54.04395214, 39.57465214],
[-13.64442429, 59.62245214, 28.96935357],
[ -0.80573857, 61.05416214, 25.76168714],
[ 11.60732, 59.40927786, 30.09232929],
[ 22.44441643, 53.81683, 40.08901643],
[ 32.24683929, 46.23280714, 48.37045 ],
[ 40.083355, 36.30976214, 53.26906357],
[ 45.11549929, 24.19315786, 55.59597786],
[ 47.84661071, 11.00381857, 56.59250643],
[ 49.47115857, -2.32260857, 57.92187643],
[ 50.10645643, -15.67903571, 60.06320429],
[-44.53299214, -29.24790071, 36.57320143],
[-37.510265, -35.96803857, 31.583005 ],
[-27.87970214, -38.27258143, 26.70722214],
[-18.03538357, -37.58489857, 21.67254214],
[ -9.18749714, -34.48015571, 17.84333 ],
[ 7.73560929, -34.98320857, 17.78346857],
[ 17.05676643, -38.20546071, 21.16640143],
[ 26.50843214, -38.83083714, 25.66079071],
[ 35.68873143, -36.34843286, 30.21249643],
[ 42.01933429, -29.83979571, 34.68843286],
[ -0.40917357, -24.128035, 16.83373214],
[ -0.30141, -16.12798571, 11.28371929],
[ -0.16755643, -8.23575357, 5.47430286],
[ 0., 0., -0. ],
[-10.55176571, 1.85589643, -14.38000214],
[ -5.12874214, 3.39399214, -16.81191714],
[ 0.39708571, 4.68778571, -18.53174857],
[ 5.90892786, 3.32928929, -16.92552929],
[ 10.86739357, 1.88789, -14.89868357],
[-32.89092143, -26.76529357, 5.027135 ],
[-26.91049143, -30.52179286, 2.15801214],
[-19.91704, -30.50822643, 0.96687786],
[-14.03447714, -26.25338714, -0.07934857],
[-20.194975, -25.09606643, -0.105195 ],
[-27.20172714, -24.94217357, 1.05472286],
[ 13.76133429, -26.45164714, -0.47553286],
[ 19.92442214, -30.95888929, 0.1769 ],
[ 26.74109214, -30.79344429, 1.21173929],
[ 32.26486071, -27.20465357, 3.49583071],
[ 27.28318143, -25.38001714, 0.202115 ],
[ 20.54642071, -25.34592071, -0.837625 ],
[-20.65447643, 20.43765429, -1.60022143],
[-12.52371, 15.83727643, -11.38166786],
[ -4.77470714, 13.59217929, -16.15096786],
[ 0.27214357, 14.80079857, -16.96316929],
[ 5.98245929, 13.59675714, -16.22999643],
[ 13.48329857, 15.99579214, -11.150485 ],
[ 20.56743714, 20.05708857, -2.29813786],
[ 13.61255286, 25.92732, -10.70571214],
[ 6.39741071, 28.31041071, -16.110175 ],
[ 0.272535, 28.85403357, -16.85116143],
[ -5.30635357, 28.454575, -16.00436143],
[-12.97188143, 26.19849929, -10.80136214],
[-17.41249286, 20.48412214, -3.41834571],
[ -4.92688071, 18.19712571, -14.92410143],
[ 0.24333071, 18.58162071, -15.87177071],
[ 6.03194429, 18.09014143, -15.01822857],
[ 17.48176643, 20.23072, -4.08055429],
[ 6.03511857, 21.85444, -14.76534929],
[ 0.14665857, 22.43481286, -15.64547857],
[ -5.14161857, 22.03205429, -14.52001571]], dtype=float)
class PoseEstimator:
    """Estimate head pose according to the facial landmarks"""
    def __init__(self, img_size=(192, 256)):
        self.size = img_size
        self.landmarks_general_68 = MODEL_POINTS_68
        self.focal_length = 962
        # self.focal_length = self.size[1]
        self.camera_center = (self.size[1] / 2, self.size[0] / 2)
        self.camera_matrix = np.array(
            [[self.focal_length, 0, self.camera_center[0]],
             [0, self.focal_length, self.camera_center[1]],
             [0, 0, 1]], dtype=float)
        # Assuming no lens distortion
        self.dist_coeefs = np.zeros((4, 1))
        # Rotation vector and translation vector; the original code gave them
        # these initial values -- I don't know where they came from.
        """self.r_vec = np.array([[0], [0], [0]], dtype=float)
        self.t_vec = np.array([[0], [0], [0]], dtype=float)"""
        self.r_vec = np.array([[0.01891013], [0.08560084], [-3.14392813]])
        self.t_vec = np.array([[-14.97821226], [-10.62040383], [-2053.03596872]])

    def get_euler_angle(self, rotation_vector):
        # Rotation angle (magnitude of the rotation vector)
        theta = cv2.norm(rotation_vector, cv2.NORM_L2)
        # Transform to a quaternion
        w = math.cos(theta / 2)
        x = math.sin(theta / 2) * rotation_vector[0][0] / theta
        y = math.sin(theta / 2) * rotation_vector[1][0] / theta
        z = math.sin(theta / 2) * rotation_vector[2][0] / theta
        # pitch (x-axis rotation)
        t0 = 2.0 * (w * x + y * z)
        t1 = 1.0 - 2.0 * (x ** 2 + y ** 2)
        pitch = math.atan2(t0, t1)
        # yaw (y-axis rotation)
        t2 = 2.0 * (w * y - z * x)
        if t2 > 1.0:
            t2 = 1.0
        if t2 < -1.0:
            t2 = -1.0
        yaw = math.asin(t2)
        # roll (z-axis rotation)
        t3 = 2.0 * (w * z + x * y)
        t4 = 1.0 - 2.0 * (y ** 2 + z ** 2)
        roll = math.atan2(t3, t4)
        return pitch, yaw, roll

    def solve_pose_by_68_points(self, image_points):
        print(self.landmarks_general_68.shape)
        print(image_points.shape)
        success, rotation_vector, translation_vector = cv2.solvePnP(
            self.landmarks_general_68,
            image_points,
            self.camera_matrix,
            self.dist_coeefs,
            rvec=self.r_vec,
            tvec=self.t_vec,
            flags=cv2.SOLVEPNP_ITERATIVE,
            useExtrinsicGuess=True)
            # flags=cv2.SOLVEPNP_EPNP)
        print(success)
        print(self.r_vec)
        return rotation_vector, translation_vector
if __name__ == "__main__":
    # IMAGE_PATH, LANDMARKS_ROOT, get_landmarks() and get_68from98() are
    # defined elsewhere in my project (omitted here).
    img = cv2.imread(IMAGE_PATH)
    img = cv2.resize(img, (192, 256))
    landmarks_load = json.load(open(LANDMARKS_ROOT, 'r'))
    landmarks_98 = get_landmarks(IMAGE_PATH, landmarks_load)
    landmarks_98 = np.array(landmarks_98, dtype=float) * 4
    landmarks_68 = get_68from98(landmarks_98)
    print(landmarks_68)

    pose_estimator = PoseEstimator(img_size=img.shape)
    rotation_vector, translation_vector = pose_estimator.solve_pose_by_68_points(landmarks_68)
    pitch, yaw, roll = pose_estimator.get_euler_angle(rotation_vector)

    def _radian2angle(r):
        return (r / math.pi) * 180

    Y, X, Z = map(_radian2angle, [pitch, yaw, roll])
    line = 'Y:{:.1f}\nX:{:.1f}\nZ:{:.1f}'.format(Y, X, Z)
    print('{},{}'.format(os.path.basename(IMAGE_PATH), line.replace('\n', ',')))
    print('yaw:{}, pitch:{}, roll:{}'.format(yaw, pitch, roll))
    print('Translation vector:{}'.format(translation_vector))
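As a cross-check on get_euler_angle (not part of my original script), the rotation vector can also be converted to a rotation matrix with cv2.Rodrigues and decomposed into Euler angles in degrees with cv2.RQDecomp3x3. A minimal sketch, using an arbitrary test rotation and a helper name of my own:

def euler_from_rvec_deg(rvec):
    # rotation vector -> 3x3 rotation matrix -> Euler angles in degrees
    rot_mat, _ = cv2.Rodrigues(rvec)
    angles, _, _, _, _, _ = cv2.RQDecomp3x3(rot_mat)
    return angles

test_rvec = np.array([[0.0], [math.radians(30.0)], [0.0]])  # pure 30-degree rotation about y
print(euler_from_rvec_deg(test_rvec))  # expected to be roughly (0, ±30, 0), up to sign convention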
Following https://blog.csdn.net/ChuiGeD..., I estimate the head pose with cv.solvePnP by fitting a standard 68-point 3D face model to the 68 pre-annotated 2D landmarks in the image.
Compared with the reference code, my code changes the initial values of r_vec and t_vec. I also re-expressed the 68 3D points in a different axis convention (x negative to the left and positive to the right, y negative up and positive down, z negative near the camera and positive far away), moved the origin to the nose tip (point [30]), and rescaled them to roughly the average face size in my images. The focal length was derived from the camera parameters of the KDEF dataset; the remaining camera intrinsics keep the values from the original code.
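To be concrete, the preprocessing of the 3D points was roughly of the following form (a sketch only; raw_model, the particular axis flips and the scale factor 0.5 are placeholders, not my exact values):

# Rough shape of the model-point preprocessing described above (placeholder values).
raw_model = np.loadtxt('raw_68_model.txt')   # hypothetical raw 68x3 model points
pts = raw_model.astype(float).copy()
pts[:, 1] *= -1.0      # flip y so that "down" is positive (image convention)
pts[:, 2] *= -1.0      # flip z so that "far from the camera" is positive
pts -= pts[30]         # move the origin to the nose tip (landmark index 30)
pts *= 0.5             # rescale to roughly the average face size in my images
model_points_68 = pts  # this is what ends up as MODEL_POINTS_68 above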
In my tests, the computed angles are completely wrong, and feeding in images with clearly different poses produces only tiny differences in the output.
When I change the focal length drastically, or restore r_vec and t_vec to the reference code's initial values, the results change significantly but are still wrong, and they still do not react to changes in the image input.
According to the function's description, r_vec and t_vec seem to just be containers for the output, so I don't understand why changing their initial values affects the result. It feels as if the function is not doing the computation I expect.
The pose-estimation step does not have to be very accurate, but it is indispensable for the rest of my project, and everything I found online basically uses this OpenCV function. If you have used this function or done a similar pose-estimation task, could you please help analyze the problem or share your experience? Thanks!!~~
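One check I plan to add to isolate the problem (a sketch, with arbitrary test pose values): project MODEL_POINTS_68 with a known pose via cv2.projectPoints, feed the resulting synthetic 2D points back into cv2.solvePnP, and see whether the recovered pose follows the known one.

# Synthetic round trip: known pose -> projected 2D points -> solvePnP.
estimator = PoseEstimator(img_size=(256, 192))
true_rvec = np.array([[0.0], [math.radians(30.0)], [0.0]])  # arbitrary test rotation
true_tvec = np.array([[0.0], [0.0], [500.0]])               # arbitrary test translation
projected, _ = cv2.projectPoints(MODEL_POINTS_68, true_rvec, true_tvec,
                                 estimator.camera_matrix, estimator.dist_coeefs)
image_points = projected.reshape(-1, 2)
ok, rvec, tvec = cv2.solvePnP(MODEL_POINTS_68, image_points,
                              estimator.camera_matrix, estimator.dist_coeefs,
                              flags=cv2.SOLVEPNP_ITERATIVE)  # no extrinsic guess here
print(ok, rvec.ravel(), tvec.ravel())  # should come back close to the true pose

If this round trip recovers the pose, I can then turn useExtrinsicGuess=True back on and pass r_vec/t_vec again, to see whether they behave as pure output containers or as the initial guess for the iterative solver.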
Below I paste the test inputs: the frontal-face landmarks, the side-face landmarks, and the 3D model points truncated to their first two dimensions. With these, the 2D-landmark input-processing functions can be ignored (a minimal driver using these arrays is sketched after them).
Frontal face:
LANDMARKS_68_S = np.array([ [43.0, 126.66666412353516],
[42.0, 133.33333587646484],
[46.0, 148.0],
[47.0, 162.66666412353516],
[50.0, 177.33333587646484],
[57.0, 189.3333282470703],
[68.0, 198.6666717529297],
[83.0, 211.1111094156901],
[97.0, 217.3333282470703],
[109.0, 210.66666412353516],
[120.0, 201.3333282470703],
[128.0, 192.0],
[138.0, 181.33333587646484],
[142.0, 168.0],
[146.0, 156.0],
[83.0, 211.1111094156901],
[143.0, 121.33333587646484],
[45.0, 121.33333587646484],
[54.0, 114.66666793823242],
[66.0, 116.0],
[73.0, 117.33333587646484],
[82.0, 118.66666412353516],
[106.0, 118.66666412353516],
[114.0, 118.66666793823242],
[121.0, 118.66666793823242],
[129.0, 112.0],
[137.0, 118.66666412353516],
[93.0, 129.3333282470703],
[93.0, 134.6666717529297],
[95.0, 150.6666717529297],
[93.0, 158.6666717529297],
[81.0, 161.3333282470703],
[89.0, 164.0],
[95.0, 169.3333282470703],
[103.0, 164.0],
[109.0, 158.6666717529297],
[57.0, 132.0],
[61.0, 124.0],
[75.0, 126.66666412353516],
[81.0, 129.3333282470703],
[75.0, 137.3333282470703],
[65.0, 137.3333282470703],
[107.0, 132.0],
[111.0, 126.66666412353516],
[121.0, 124.0],
[131.0, 124.0],
[127.0, 129.3333282470703],
[113.0, 132.0],
[69.0, 172.0],
[83.0, 169.3333282470703],
[91.0, 169.3333282470703],
[93.0, 169.3333282470703],
[101.0, 166.6666717529297],
[109.0, 169.3333282470703],
[121.0, 169.3333282470703],
[115.0, 182.6666717529297],
[105.0, 188.0],
[97.0, 188.0],
[85.0, 190.6666717529297],
[79.0, 182.6666717529297],
[73.0, 172.0],
[85.0, 172.0],
[93.0, 172.0],
[105.0, 174.6666717529297],
[117.0, 172.0],
[109.0, 182.6666717529297],
[97.0, 185.3333282470703],
[83.0, 185.3333282470703]], dtype=float)
Face turned 45° to the right:
LANDMARKS_68_HR = np.array([ [65.0, 121.33333587646484],
[66.0, 133.33333587646484],
[67.0, 148.0],
[68.0, 172.0],
[73.0, 181.3333282470703],
[84.0, 196.0],
[99.0, 202.6666717529297],
[117.66666666666667, 211.1111094156901],
[135.0, 209.3333282470703],
[139.0, 201.33333587646484],
[143.0, 192.0],
[147.0, 178.6666717529297],
[150.0, 169.33333587646484],
[155.0, 158.66666412353516],
[159.0, 153.33333587646484],
[117.66666666666667, 211.1111094156901],
[157.0, 126.66666412353516],
[95.0, 121.33333587646484],
[103.0, 120.0],
[115.0, 116.0],
[122.0, 116.0],
[131.0, 121.33333206176758],
[145.0, 121.33333206176758],
[149.0, 122.66666793823242],
[154.0, 118.66666793823242],
[156.0, 117.33333587646484],
[159.0, 113.33333587646484],
[143.0, 132.0],
[143.0, 140.0],
[147.0, 153.3333282470703],
[149.0, 156.0],
[129.0, 161.3333282470703],
[133.0, 164.0],
[143.0, 164.0],
[145.0, 161.3333282470703],
[151.0, 161.3333282470703],
[101.0, 132.0],
[109.0, 124.0],
[123.0, 124.0],
[127.0, 132.0],
[121.0, 137.3333282470703],
[111.0, 134.6666717529297],
[145.0, 132.0],
[145.0, 126.66666412353516],
[153.0, 124.0],
[153.0, 124.0],
[153.0, 129.3333282470703],
[145.0, 129.3333282470703],
[111.0, 172.0],
[121.0, 169.3333282470703],
[139.0, 174.6666717529297],
[143.0, 172.0],
[143.0, 174.6666717529297],
[145.0, 172.0],
[145.0, 172.0],
[141.0, 182.6666717529297],
[143.0, 188.0],
[139.0, 190.6666717529297],
[127.0, 185.3333282470703],
[119.0, 185.3333282470703],
[111.0, 172.0],
[127.0, 172.0],
[137.0, 172.0],
[143.0, 172.0],
[141.0, 172.0],
[143.0, 180.0],
[139.0, 185.3333282470703],
[121.0, 180.0]], dtype=float)
Standard face (first two dimensions of the 3D model points):
LANDMARKS_68 = np.array([ [42.69978143, 112.72364929],
[43.14157357, 126.189125],
[44.74255643, 139.67454214],
[47.373685, 152.63487214],
[52.41645, 164.41352214],
[60.57446, 174.354015],
[70.75151143, 182.04395214],
[82.35557571, 187.62245214],
[95.19426143, 189.05416214000002],
[107.60732, 187.40927786],
[118.44441643, 181.81683],
[128.24683929, 174.23280714],
[136.08335499999998, 164.30976214],
[141.11549929, 152.19315785999999],
[143.84661071, 139.00381857],
[145.47115857, 125.67739143],
[146.10645643, 112.32096429],
[51.46700786, 98.75209929],
[58.489735, 92.03196143],
[68.12029786, 89.72741857],
[77.96461643, 90.41510142999999],
[86.81250286, 93.51984429000001],
[103.73560929, 93.01679143],
[113.05676643, 89.79453929],
[122.50843214, 89.16916286],
[131.68873143, 91.65156714],
[138.01933429000002, 98.16020429],
[95.59082643, 103.871965],
[95.69859, 111.87201429],
[95.83244357, 119.76424643],
[96.0, 128.0],
[85.44823429, 129.85589643],
[90.87125786, 131.39399214],
[96.39708571, 132.68778571],
[101.90892786, 131.32928929],
[106.86739357, 129.88789],
[63.10907857, 101.23470643],
[69.08950856999999, 97.47820714],
[76.08296, 97.49177356999999],
[81.96552286, 101.74661286],
[75.805025, 102.90393356999999],
[68.79827286, 103.05782643],
[109.76133429000001, 101.54835286],
[115.92442214, 97.04111071],
[122.74109214, 97.20655571],
[128.26486071, 100.79534643],
[123.28318143, 102.61998286],
[116.54642071, 102.65407929],
[75.34552357, 148.43765429],
[83.47629, 143.83727643],
[91.22529286, 141.59217929],
[96.27214357, 142.80079857],
[101.98245929, 141.59675714],
[109.48329857, 143.99579214],
[116.56743714, 148.05708857],
[109.61255286, 153.92732],
[102.39741071, 156.31041070999999],
[96.272535, 156.85403357],
[90.69364643, 156.454575],
[83.02811857, 154.19849929],
[78.58750714, 148.48412214],
[91.07311929, 146.19712571],
[96.24333071, 146.58162071],
[102.03194429, 146.09014143],
[113.48176643, 148.23072],
[102.03511857, 149.85444],
[96.14665857, 150.43481286],
[90.85838143, 150.03205429]], dtype=float)
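A minimal driver that feeds the arrays above straight into the estimator (a sketch; it replaces the image/JSON loading in __main__):

# Run the pose estimator on the pasted test arrays; no image or JSON loading needed.
for name, pts2d in [('frontal', LANDMARKS_68_S),
                    ('right-45', LANDMARKS_68_HR),
                    ('standard', LANDMARKS_68)]:
    estimator = PoseEstimator(img_size=(256, 192))  # fresh estimator per input
    rvec, tvec = estimator.solve_pose_by_68_points(pts2d)
    pitch, yaw, roll = estimator.get_euler_angle(rvec)
    print(name, ['{:.1f}'.format(math.degrees(a)) for a in (pitch, yaw, roll)])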
Current estimation results (the green dots are the pre-annotated 2D landmarks; the face image is only for display and is not used in the estimation). The frontal image failed to upload after several tries; I'll try again later.
Frontal face: Y:10.9 X:-2.7 Z:-179.2