[Goal]

  • Perform robust feature matching between two images of the same scene taken from very different viewpoints.

[LightGlue Github page]

  • https://github.com/cvg/LightGlue

[Advantages]

  • A model that achieves strong feature matching while keeping GPU memory consumption low
  • A model that matches features robustly even under large viewpoint changes

[Dependencies]

python 3.8
pytorch 2.0.1
cuda 11.7
torchvision 0.15.2
einops 0.6.1
kornia 0.7.0
xformers 0.0.21
opencv-python
matplotlib
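
  • Quick sanity check that the installed versions match the list above (a minimal sketch; xformers is listed for optional attention speed-ups):

import torch, torchvision, einops, kornia, cv2

print(torch.__version__)        # expect 2.0.1
print(torch.version.cuda)       # expect 11.7
print(torchvision.__version__)  # expect 0.15.2
print(einops.__version__)       # expect 0.6.1
print(kornia.__version__)       # expect 0.7.0
print(cv2.__version__)          # opencv-python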

[Model Configuration]

  • SuperPoint (feature extraction part)

def __init__(self, **conf):
    super().__init__()
    self.conf = {**self.default_conf, **conf}

    self.relu = nn.ReLU(inplace=True)
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    c1, c2, c3, c4, c5 = 64, 64, 128, 128, 256

    self.conv1a = nn.Conv2d(1, c1, kernel_size=3, stride=1, padding=1)
    self.conv1b = nn.Conv2d(c1, c1, kernel_size=3, stride=1, padding=1)
    self.conv2a = nn.Conv2d(c1, c2, kernel_size=3, stride=1, padding=1)
    self.conv2b = nn.Conv2d(c2, c2, kernel_size=3, stride=1, padding=1)
    self.conv3a = nn.Conv2d(c2, c3, kernel_size=3, stride=1, padding=1)
    self.conv3b = nn.Conv2d(c3, c3, kernel_size=3, stride=1, padding=1)
    self.conv4a = nn.Conv2d(c3, c4, kernel_size=3, stride=1, padding=1)
    self.conv4b = nn.Conv2d(c4, c4, kernel_size=3, stride=1, padding=1)

    self.convPa = nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1)
    self.convPb = nn.Conv2d(c5, 65, kernel_size=1, stride=1, padding=0)

    self.convDa = nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1)
    self.convDb = nn.Conv2d(
        c5, self.conf['descriptor_dim'],
        kernel_size=1, stride=1, padding=0)

    url = "https://github.com/cvg/LightGlue/releases/download/v0.1_arxiv/superpoint_v1.pth"
    self.load_state_dict(torch.hub.load_state_dict_from_url(url))

    mk = self.conf['max_num_keypoints']
    if mk is not None and mk <= 0:
        raise ValueError('max_num_keypoints must be positive or None')

def forward(self, data: dict) -> dict:
    """ Compute keypoints, scores, descriptors for image """
    for key in self.required_data_keys:
        assert key in data, f'Missing key {key} in data'
    image = data['image']
    if image.shape[1] == 3:  # RGB
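        # RGB -> grayscale with ITU-R BT.601 luma weights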
        scale = image.new_tensor([0.299, 0.587, 0.114]).view(1, 3, 1, 1)
        image = (image*scale).sum(1, keepdim=True)
    # Shared Encoder
    x = self.relu(self.conv1a(image))
    x = self.relu(self.conv1b(x))
    x = self.pool(x)
    x = self.relu(self.conv2a(x))
    x = self.relu(self.conv2b(x))
    x = self.pool(x)
    x = self.relu(self.conv3a(x))
    x = self.relu(self.conv3b(x))
    x = self.pool(x)
    x = self.relu(self.conv4a(x))
    x = self.relu(self.conv4b(x))

    # Compute the dense keypoint scores
    cPa = self.relu(self.convPa(x))
    scores = self.convPb(cPa)
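    # 65 channels = 64 positions of an 8x8 cell + 1 "dustbin" (no keypoint);
    # softmax over channels, then drop the dustbin before unfolding to full resolution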
    scores = torch.nn.functional.softmax(scores, 1)[:, :-1]
    b, _, h, w = scores.shape
    scores = scores.permute(0, 2, 3, 1).reshape(b, h, w, 8, 8)
    scores = scores.permute(0, 1, 3, 2, 4).reshape(b, h*8, w*8)
    scores = simple_nms(scores, self.conf['nms_radius'])

    # Discard keypoints near the image borders
    if self.conf['remove_borders']:
        pad = self.conf['remove_borders']
        scores[:, :pad] = -1
        scores[:, :, :pad] = -1
        scores[:, -pad:] = -1
        scores[:, :, -pad:] = -1

    # Extract keypoints
    best_kp = torch.where(scores > self.conf['detection_threshold'])
    scores = scores[best_kp]

    # Separate into batches
    keypoints = [torch.stack(best_kp[1:3], dim=-1)[best_kp[0] == i]
                 for i in range(b)]
    scores = [scores[best_kp[0] == i] for i in range(b)]

    # Keep the k keypoints with highest score
    if self.conf['max_num_keypoints'] is not None:
        keypoints, scores = list(zip(*[
            top_k_keypoints(k, s, self.conf['max_num_keypoints'])
            for k, s in zip(keypoints, scores)]))

    # Convert (h, w) to (x, y)
    keypoints = [torch.flip(k, [1]).float() for k in keypoints]

    # Compute the dense descriptors
    cDa = self.relu(self.convDa(x))
    descriptors = self.convDb(cDa)
    descriptors = torch.nn.functional.normalize(descriptors, p=2, dim=1)

    # Extract descriptors
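    # sample_descriptors (helper not shown) bilinearly samples the dense
    # descriptor map at each keypoint location (feature stride 8) and re-normalizes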
    descriptors = [sample_descriptors(k[None], d[None], 8)[0]
                   for k, d in zip(keypoints, descriptors)]

    return {
        'keypoints': torch.stack(keypoints, 0),
        'keypoint_scores': torch.stack(scores, 0),
        'descriptors': torch.stack(descriptors, 0).transpose(-1, -2).contiguous(),
    }
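
The simple_nms call above is a helper not shown here; in the repository it is an iterative, max-pool-based non-maximum suppression. A sketch of the idea (assumed to be close to the helper in lightglue/superpoint.py, not guaranteed verbatim):

def simple_nms(scores, nms_radius: int):
    # suppress any score that is not the maximum within its (2r+1)^2 window
    assert nms_radius >= 0

    def max_pool(x):
        return torch.nn.functional.max_pool2d(
            x, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius)

    zeros = torch.zeros_like(scores)
    max_mask = scores == max_pool(scores)      # local maxima survive
    for _ in range(2):                         # two passes to resolve ties
        supp_mask = max_pool(max_mask.float()) > 0
        supp_scores = torch.where(supp_mask, zeros, scores)
        new_max_mask = supp_scores == max_pool(supp_scores)
        max_mask = max_mask | (new_max_mask & (~supp_mask))
    return torch.where(max_mask, scores, zeros)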

  • LightGlue (matcher part; note the constructor below is LightGlue's, which consumes SuperPoint or DISK features, not DISK's own extractor)

def __init__(self, features='superpoint', **conf) -> None:
    super().__init__()
    self.conf = {**self.default_conf, **conf}
    if features is not None:
        assert (features in list(self.features.keys()))
        self.conf['weights'], self.conf['input_dim'] = \
            self.features[features]
    self.conf = conf = SimpleNamespace(**self.conf)

    if conf.input_dim != conf.descriptor_dim:
        self.input_proj = nn.Linear(
            conf.input_dim, conf.descriptor_dim, bias=True)
    else:
        self.input_proj = nn.Identity()

    head_dim = conf.descriptor_dim // conf.num_heads
    self.posenc = LearnableFourierPositionalEncoding(2, head_dim, head_dim)

    h, n, d = conf.num_heads, conf.n_layers, conf.descriptor_dim

    self.transformers = nn.ModuleList(
        [TransformerLayer(d, h, conf.flash) for _ in range(n)]
    )

    self.log_assignment = nn.ModuleList(
        [MatchAssignment(d) for _ in range(n)])
    self.token_confidence = nn.ModuleList([
        TokenConfidence(d) for _ in range(n-1)])
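    # one MatchAssignment head per layer and a TokenConfidence head for all but
    # the last layer: these enable LightGlue's adaptive depth (early exit) and
    # keypoint pruning across layers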
    self.register_buffer('confidence_thresholds', torch.Tensor([
        self.confidence_threshold(i) for i in range(self.conf.n_layers)]))

    state_dict = None
    if features is not None:
        fname = f'{conf.weights}_{self.version}.pth'.replace('.', '-')
        state_dict = torch.hub.load_state_dict_from_url(
            self.url.format(self.version, features), file_name=fname)
        self.load_state_dict(state_dict, strict=False)
    elif conf.weights is not None:
        path = Path(__file__).parent
        path = path / 'weights/{}.pth'.format(self.conf.weights)
        state_dict = torch.load(str(path), map_location='cpu')

    if state_dict:
        # rename old state dict entries
        for i in range(self.conf.n_layers):
            pattern = f'self_attn.{i}', f'transformers.{i}.self_attn'
            state_dict = {k.replace(*pattern): v for k, v in state_dict.items()}
            pattern = f'cross_attn.{i}', f'transformers.{i}.cross_attn'
            state_dict = {k.replace(*pattern): v for k, v in state_dict.items()}
        self.load_state_dict(state_dict, strict=False)

    # static lengths LightGlue is compiled for (only used with torch.compile)
    self.static_lengths = None
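
confidence_threshold(i), used to fill the confidence_thresholds buffer above, is defined elsewhere in lightglue.py; to my understanding it decays a base threshold with depth, roughly as follows (a sketch under that assumption; verify against the repo):

import numpy as np

def confidence_threshold(layer_index: int, n_layers: int) -> float:
    # decays from about 0.9 toward 0.8 as the layer index grows, so deeper
    # layers require less confidence before stopping early
    threshold = 0.8 + 0.1 * np.exp(-4.0 * layer_index / n_layers)
    return float(np.clip(threshold, 0.0, 1.0))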

[Get Pre-trained Model]

  • No separate download step is needed: the constructors above fetch the pre-trained weights from the GitHub releases via torch.hub.load_state_dict_from_url (see the superpoint_v1.pth URL in the SuperPoint constructor), and torch.hub caches them locally under ~/.cache/torch/hub/checkpoints/.

[Basic Code]

  • Based on the basic code from the GitHub page
from lightglue import LightGlue, SuperPoint, DISK
from lightglue.utils import load_image, rbd
import matplotlib.pyplot as plt
import pdb

query_path = "~/query.png"
cand_path = "~/cand.png"

# SuperPoint+LightGlue
extractor = SuperPoint(max_num_keypoints=2048).eval().cuda()  # load the extractor
matcher = LightGlue(features='superpoint').eval().cuda()  # load the matcher

# or DISK+LightGlue (alternative: running these lines overrides the SuperPoint pair above)
extractor = DISK(max_num_keypoints=2048).eval().cuda()  # load the extractor
matcher = LightGlue(features='disk').eval().cuda()  # load the matcher

# load each image as a torch.Tensor on GPU with shape (3,H,W), normalized in [0,1]
image0 = load_image(query_path).cuda()
image1 = load_image(cand_path).cuda()

# extract local features
feats0 = extractor.extract(image0)  # auto-resize the image, disable with resize=None
feats1 = extractor.extract(image1)

# match the features
matches01 = matcher({'image0': feats0, 'image1': feats1})
feats0, feats1, matches01 = [rbd(x) for x in [feats0, feats1, matches01]]  # remove batch dimension
matches = matches01['matches']  # indices with shape (K,2)
points0 = feats0['keypoints'][matches[..., 0]]  # coordinates in image #0, shape (K,2)
points1 = feats1['keypoints'][matches[..., 1]]  # coordinates in image #1, shape (K,2)

# convert each (3, H, W) torch.Tensor to an (H, W, 3) numpy array for plotting
np_img1 = image0.permute(1, 2, 0).cpu().numpy()
np_img2 = image1.permute(1, 2, 0).cpu().numpy()

# Plot keypoint in each images
plt.scatter(points0[:,0].cpu(), points0[:,1].cpu(), c='green', s=5)
plt.imshow(np_img1)
plt.show()

plt.scatter(points1[:,0].cpu(), points1[:,1].cpu(), c='green', s=5)
plt.imshow(np_img2)
plt.show()
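
  • The repository also ships a small visualization helper. Assuming the lightglue.viz2d module (as used in the repo's README), match lines can be drawn across both images:

from lightglue import viz2d

# draw the two images side by side and overlay the match lines
axes = viz2d.plot_images([image0, image1])
viz2d.plot_matches(points0, points1, color='lime', lw=0.2)
plt.show()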
  • Reference: https://github.com/cvg/LightGlue

[Experiments]

  • Prepared three challenging image pairs with large viewpoint differences
    • Test Pair 1
      (figure: query image / candidate image)
    • Test Pair 2 (challenging!)
      (figure: query image / candidate image)
    • Test Pair 3
      (figure: query image / candidate image)

[Result]

  • Plotted the results using the pre-trained models provided by LightGlue as-is (no additional training on this dataset)
  • [Case 1: SuperPoint + LightGlue]
    • The test desktop has an NVIDIA RTX 3060; running SuperPoint + LightGlue consumes about 2.5 GB of GPU memory
# Debug
feats0 type: <class 'dict'> -> "keypoints" "keypoint_scores" "descriptors" "image_size"
feats0["keypoints"] type: <class 'torch.Tensor'> & size: torch.Size([809, 2])
feats0["keypoint_scores"] type: <class 'torch.Tensor'> & size: torch.Size([809])
feats0["descriptors"] type: <class 'torch.Tensor'> & size: torch.Size([809, 256])
feats0["image_size"] type: <class 'torch.Tensor'> & size: torch.Size([2])

feats1 type: <class 'dict'> -> "keypoints" "keypoint_scores" "descriptors" "image_size"
feats1["keypoints"] type: <class 'torch.Tensor'> & size: torch.Size([1120, 2])
feats1["keypoint_scores"] type: <class 'torch.Tensor'> & size: torch.Size([1120])
feats1["descriptors"] type: <class 'torch.Tensor'> & size: torch.Size([1120, 256])
feats1["image_size"] type: <class 'torch.Tensor'> & size: torch.Size([2])

matches01 type: <class 'dict'> -> "matches0" "matches1" "matching_scores0" "matching_scores1" "matches" "scores" "prune0" "prune1"
matches01["matches0"] type: <class 'torch.Tensor'> & size: torch.Size([809])
matches01["matches1"] type: <class 'torch.Tensor'> & size: torch.Size([1120])
matches01["matching_scores0"] type: <class 'torch.Tensor'> & size: torch.Size([809])
matches01["matching_scores1"] type: <class 'torch.Tensor'> & size: torch.Size([1120])
matches01["matches"] type: <class 'torch.Tensor'> & size: torch.Size([94, 2])
matches01["scores"] type: <class 'torch.Tensor'> & size: torch.Size([94])
matches01["prune0"] type: <class 'torch.Tensor'> & size: torch.Size([809])
matches01["prune1"] type: <class 'torch.Tensor'> & size: torch.Size([1120])

points0 type: <class 'torch.Tensor'> & size: torch.Size([94, 2])
points1 type: <class 'torch.Tensor'> & size: torch.Size([94, 2])
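
  • Following the SuperGlue/LightGlue convention (my assumption: matches0[i] holds the index of the matching keypoint in image 1, or -1 if keypoint i found no match), the (94, 2) "matches" tensor can be reconstructed from "matches0":

# sketch: rebuild the pair list from matches0 (-1 = unmatched)
valid = matches01['matches0'] > -1
idx0 = torch.nonzero(valid).squeeze(1)      # keypoint indices in image 0
idx1 = matches01['matches0'][valid]         # matched keypoint indices in image 1
pairs = torch.stack([idx0, idx1], dim=-1)   # expected to equal matches01['matches']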
    • Keypoint Detection
    • Feature Matching Result (inliers = 94)


  • [Case 2: DISK + LightGlue]
    • On the same desktop (NVIDIA RTX 3060), running DISK + LightGlue consumes about 4.5 GB of GPU memory
##### Since max_num_keypoints=2048 is set, DISK extracts the full 2048 features for both images #####

# Debug
feats0 type: <class 'dict'> -> "keypoints" "keypoint_scores" "descriptors" "image_size"
feats0["keypoints"] type: <class 'torch.Tensor'> & size: torch.Size([2048, 2])
feats0["keypoint_scores"] type: <class 'torch.Tensor'> & size: torch.Size([2048])
feats0["descriptors"] type: <class 'torch.Tensor'> & size: torch.Size([2048, 128])
feats0["image_size"] type: <class 'torch.Tensor'> & size: torch.Size([2])

feats1 type: <class 'dict'> -> "keypoints" "keypoint_scores" "descriptors" "image_size"
feats1["keypoints"] type: <class 'torch.Tensor'> & size: torch.Size([2048, 2])
feats1["keypoint_scores"] type: <class 'torch.Tensor'> & size: torch.Size([2048])
feats1["descriptors"] type: <class 'torch.Tensor'> & size: torch.Size([2048, 128])
feats1["image_size"] type: <class 'torch.Tensor'> & size: torch.Size([2])

matches01 type: <class 'dict'> -> "matches0" "matches1" "matching_scores0" "matching_scores1" "matches" "scores" "prune0" "prune1"
matches01["matches0"] type: <class 'torch.Tensor'> & size: torch.Size([2048])
matches01["matches1"] type: <class 'torch.Tensor'> & size: torch.Size([2048])
matches01["matching_scores0"] type: <class 'torch.Tensor'> & size: torch.Size([2048])
matches01["matching_scores1"] type: <class 'torch.Tensor'> & size: torch.Size([2048])
matches01["matches"] type: <class 'torch.Tensor'> & size: torch.Size([57, 2])
matches01["scores"] type: <class 'torch.Tensor'> & size: torch.Size([57])
matches01["prune0"] type: <class 'torch.Tensor'> & size: torch.Size([2048])
matches01["prune1"] type: <class 'torch.Tensor'> & size: torch.Size([2048])

points0 type: <class 'torch.Tensor'> & size: torch.Size([57, 2])
points1 type: <class 'torch.Tensor'> & size: torch.Size([57, 2])
    • Keypoint Detection
[CASE 1] max_num_keypoints = 2048
[CASE 2] max_num_keypoints = 4096
[CASE 3] max_num_keypoints = 8192 (consumes about 7 GB of GPU memory)
[CASE 4] max_num_keypoints = 15000 (consumes about 7 GB of GPU memory)
[CASE 5] max_num_keypoints = 20000 (consumes about 7 GB of GPU memory)
    • Feature Matching Result
[CASE 1] max_num_keypoints = 2048 & matching pairs = 57
[CASE 2] max_num_keypoints = 4096 & matching pairs = 74
[CASE 3] max_num_keypoints = 8192 & matching pairs = 128 (consumes about 7 GB of GPU memory)
[CASE 4] max_num_keypoints = 15000 & matching pairs = 138 (consumes about 7 GB of GPU memory)
[CASE 5] max_num_keypoints = 20000 & matching pairs = 139 (consumes about 7 GB of GPU memory)
  • Increasing max_num_keypoints further produces the same result

[Ablation Study]

  • Number of inliers for each configuration across the three test pairs:

    Configuration                                  Pair 1   Pair 2   Pair 3
    SuperPoint + LightGlue                             94      112       49
    DISK + LightGlue (max_num_keypoints = 2048)        57       19       74
    DISK + LightGlue (max_num_keypoints = 4096)        74       15       90
    DISK + LightGlue (max_num_keypoints = 8192)       128       13       24
    DISK + LightGlue (max_num_keypoints = 15000)      138       18       12
    DISK + LightGlue (max_num_keypoints = 20000)      139       18       12
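
  • Since opencv-python is already a dependency, the match counts above could additionally be verified geometrically. A hedged sketch using RANSAC homography estimation (hypothetical post-processing, not part of the experiments above; the planar-scene assumption may not hold for every pair):

import cv2

pts0 = points0.cpu().numpy()   # matched keypoints from the basic code above
pts1 = points1.cpu().numpy()
H, mask = cv2.findHomography(pts0, pts1, cv2.RANSAC, ransacReprojThreshold=3.0)
print('RANSAC inliers:', int(mask.sum()), '/', len(mask))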