panorama-from-scratch
Making a panorama from scratch
Nowadays, our phone cameras are well calibrated, with advanced feature detection and matching techniques built in, so we can make panoramas without needing to think. But thinking is fun, so why not take a walk back in time to when panorama making was a multi-step puzzle using SIFT, RANSAC & projective transformations.
from pathlib import Path
import cv2 as cv
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
%matplotlib inline
# Sort so the frames load in left-to-right order (glob order is OS-dependent)
images = [cv.imread(str(path)) for path in sorted(Path("./images").glob("*.png"))]
def show_image(img: np.ndarray) -> None:
    img_rgb = cv.cvtColor(img, cv.COLOR_BGR2RGB)
    # Display the image using Matplotlib
    plt.imshow(img_rgb)
    plt.axis("off")  # Hide axes
    plt.show()
def find_im_corners(frame_shape: tuple[int, int, int], H: np.ndarray) -> np.ndarray:
    # Project the image corners through H. If the projected area blows up to
    # more than 10x the original, the homography is degenerate for our
    # purposes, so fall back to the untransformed corners.
    h1, w1, _ = frame_shape
    original_size = h1 * w1
    im1_corners = np.float32([[0, 0], [0, h1], [w1, h1], [w1, 0]]).reshape(-1, 1, 2)
    corner_locations = cv.perspectiveTransform(im1_corners, H)
    (xmin, ymin) = np.int32(np.floor(corner_locations.min(axis=0).ravel()))
    (xmax, ymax) = np.int32(np.ceil(corner_locations.max(axis=0).ravel()))
    im_size = (xmax - xmin) * (ymax - ymin)
    if abs(im_size) < original_size * 10:
        return corner_locations
    return im1_corners
for image in images:
    show_image(image)




Calculate key points and descriptors with SIFT (Scale-Invariant Feature Transform)
frame_bw = [cv.cvtColor(im, cv.COLOR_BGR2GRAY) for im in images]
sift = cv.SIFT_create()
# sift.detectAndCompute returns a tuple of keypoints & descriptors
sift_features = [sift.detectAndCompute(im, None) for im in tqdm(frame_bw)]
100%|██████████| 4/4 [00:08<00:00, 2.05s/it]
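Before matching anything, it's worth eyeballing what SIFT actually found. A quick sketch (not part of the original flow) that draws the keypoints, with their scale and orientation, on the first frame:
# Visualize the detected keypoints; the rich flag draws scale and orientation
keypoints0 = sift_features[0][0]
print(f"Frame 0 has {len(keypoints0)} keypoints")
kp_vis = cv.drawKeypoints(
    images[0], keypoints0, None, flags=cv.DrawMatchesFlags_DRAW_RICH_KEYPOINTS
)
show_image(kp_vis)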
Match Features Between Frames using FLANN (Fast Library for Approximate Nearest Neighbours)
FLANN_INDEX_KDTREE = 1
index_params = {"algorithm": FLANN_INDEX_KDTREE, "trees": 5}
search_params = {"checks": 50}
threshold = 0.55  # Lowe's ratio test: keep a match only if it beats the runner-up by this factor
flann = cv.FlannBasedMatcher(index_params, search_params)
def match_features(
    descriptor1: np.ndarray, descriptor2: np.ndarray
) -> list[list[cv.DMatch]]:
    # For each descriptor, find its two nearest neighbours in the other image
    results = flann.knnMatch(descriptor1, descriptor2, k=2)
    distance_mat = np.array([[n1.distance, n2.distance] for n1, n2 in results])
    # Keep a match only if the best neighbour clearly beats the second-best
    match_mat = distance_mat[:, 0] < threshold * distance_mat[:, 1]
    return [[result[0]] for i, result in enumerate(results) if match_mat[i]]
matches = [
    match_features(sift_features[i][1], sift_features[i + 1][1])
    for i in tqdm(range(len(images) - 1))
]
100%|██████████| 3/3 [00:10<00:00, 3.53s/it]
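FLANN trades exactness for speed. If the match counts look suspicious, a brute-force matcher returns the exact nearest neighbours as a cross-check; a minimal sketch on the first image pair, reusing the same ratio threshold:
# Exact (but slower) matching as a cross-check on FLANN's approximation
bf = cv.BFMatcher(cv.NORM_L2)
bf_results = bf.knnMatch(sift_features[0][1], sift_features[1][1], k=2)
bf_good = [[n1] for n1, n2 in bf_results if n1.distance < threshold * n2.distance]
print(f"FLANN kept {len(matches[0])} matches, brute force keeps {len(bf_good)}")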
img3 = cv.drawMatchesKnn(
    images[0],
    sift_features[0][0],
    images[1],
    sift_features[1][0],
    matches[0],
    None,
    flags=cv.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS,
)
show_image(img3)

Find the homography that projects one image onto the other using RANSAC (RANdom SAmple Consensus)
homography_set = []
for i in tqdm(range(len(matches))):
    key_points1 = sift_features[i][0]
    key_points2 = sift_features[i + 1][0]
    # queryIdx indexes into image i's keypoints, trainIdx into image i+1's
    src_points = np.float32(
        [key_points1[m[0].queryIdx].pt for m in matches[i]]
    ).reshape(-1, 1, 2)
    dst_points = np.float32(
        [key_points2[m[0].trainIdx].pt for m in matches[i]]
    ).reshape(-1, 1, 2)
    homography, _ = cv.findHomography(src_points, dst_points, cv.RANSAC, 5.0)
    homography_set.append({"H": homography, "idx": i})
100%|██████████| 3/3 [00:00<00:00, 85.79it/s]
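cv.findHomography also returns an inlier mask, which the loop above throws away. Counting the inliers is a cheap way to judge how well RANSAC did; a quick sketch on the first pair:
# How many of the ratio-test matches does RANSAC actually agree with?
kp1, kp2 = sift_features[0][0], sift_features[1][0]
src = np.float32([kp1[m[0].queryIdx].pt for m in matches[0]]).reshape(-1, 1, 2)
dst = np.float32([kp2[m[0].trainIdx].pt for m in matches[0]]).reshape(-1, 1, 2)
_, inlier_mask = cv.findHomography(src, dst, cv.RANSAC, 5.0)
print(f"{int(inlier_mask.sum())}/{len(inlier_mask)} matches are RANSAC inliers")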
This part is just to make the final panorama nicer
Right now, each homography stitches one image onto the following image. We could apply them one after another to build a panorama, but that would warp every frame into the perspective of the last image, which looks strange. Instead, we can compose the homographies so everything projects into the perspective of a middle image (the second or third), which looks much nicer; a general version of this composition is sketched below.
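For just four frames the composition is easy to write by hand, but it generalizes. Here is a hypothetical helper (not in the original notebook) that chains the pairwise homographies to any reference frame. Note the order: findHomography gave us dst = H @ src, so mapping frame 1 into frame 3 is H_23 @ H_12.
def chain_to_reference(pairwise: list[np.ndarray], ref: int) -> list[np.ndarray]:
    # pairwise[i] maps frame i into frame i + 1; ref is the target frame index
    chained = []
    for i in range(len(pairwise) + 1):
        H = np.eye(3)
        for j in range(min(i, ref), max(i, ref)):
            H = pairwise[j] @ H  # compose the chain between frame i and frame ref
        if i > ref:
            H = np.linalg.inv(H)  # we built ref -> i, but we want i -> ref
        chained.append(H)
    return chained
# chain_to_reference([h["H"] for h in homography_set], ref=2) reproduces the
# hand-composed matrices below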
# I will project all to the POV of picture 3
# findHomography gave us dst = H @ src, so mapping 1 -> 3 is H_23 @ H_12
H_13 = homography_set[1]["H"] @ homography_set[0]["H"]
H_23 = homography_set[1]["H"]
H_33 = np.eye(3)
H_43 = np.linalg.inv(homography_set[2]["H"])  # The inverse of H_34 is H_43
homography_set = [
{"H": H_13, "idx": 0},
{"H": H_23, "idx": 1},
{"H": H_33, "idx": 2},
{"H": H_43, "idx": 3},
]
More stuff to make the final result look nicer
By default we would be projecting the other images onto image 3, but the output canvas would still be the size of image 3 unless we change it. To see the whole panorama, we can find where the corners of each image get projected to and take the minimum and maximum over all of them. From those bounds we can build a translation matrix that shifts every homography into this larger canvas.
all_corners = [
    find_im_corners(images[item["idx"]].shape, item["H"]) for item in homography_set
]
all_points = np.concatenate(all_corners, axis=0)
(xmin, ymin) = np.int32(np.floor(all_points.min(axis=0).ravel()))
(xmax, ymax) = np.int32(np.ceil(all_points.max(axis=0).ravel()))
# Shift everything so the panorama's top-left corner lands at (0, 0)
translation = np.array([[1, 0, -xmin], [0, 1, -ymin], [0, 0, 1]])
visual_homographies = [
    {"idx": item["idx"], "H": translation @ item["H"]} for item in homography_set
]
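As a sanity check (assuming none of the frames triggered the runaway-projection fallback in find_im_corners), every translated corner should now land inside the new canvas:
# Sanity check: after translation, all projected corners should fit the canvas
canvas_w, canvas_h = xmax - xmin, ymax - ymin
for item in visual_homographies:
    h, w, _ = images[item["idx"]].shape
    corners = np.float32([[0, 0], [0, h], [w, h], [w, 0]]).reshape(-1, 1, 2)
    projected = cv.perspectiveTransform(corners, item["H"])
    assert projected[..., 0].min() >= -1 and projected[..., 0].max() <= canvas_w + 1
    assert projected[..., 1].min() >= -1 and projected[..., 1].max() <= canvas_h + 1
print(f"Panorama canvas: {canvas_w} x {canvas_h} pixels")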
Now we can project our images!
centered_frames = [
    cv.warpPerspective(images[item["idx"]], item["H"], (xmax - xmin, ymax - ymin))
    for item in visual_homographies
]
for img in centered_frames:
    show_image(img)




Lastly, we need to combine our frames and blend them to make the result more visually appealing
Because all the frames now share the same canvas size, we can simply overlay them and take the average pixel intensity wherever they overlap. (There are other ways of blending; one alternative is sketched after the result below, but averaging is the easiest.)
def blend_all_images(image_list: list[np.ndarray]) -> np.ndarray:
    # Build a 0/1 mask of valid pixels per frame; erode slightly so the ragged
    # warp borders don't bleed into the blend
    erosion_kernel = np.ones((5, 5), np.uint8)
    image_masks = []
    for image in image_list:
        gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
        _, mask = cv.threshold(gray, 1, 1, cv.THRESH_BINARY)
        image_masks.append(cv.erode(mask, erosion_kernel))
    # Count how many frames cover each pixel so overlaps average out
    weighting_mask = np.sum(image_masks, axis=0)
    weighting_mask[weighting_mask == 0] = 1  # Avoid zero divide
    blended_image = np.zeros_like(image_list[0], dtype=np.float32)
    for i, mask in tqdm(enumerate(image_masks)):
        weighted_mask = mask / weighting_mask
        mask_3_channel = np.stack([weighted_mask, weighted_mask, weighted_mask], axis=2)
        blended_image += mask_3_channel * image_list[i].astype(np.float32)
    return np.clip(blended_image, 0, 255).astype(np.uint8)
blended_panorama = blend_all_images(centered_frames)
show_image(blended_panorama)
4it [00:08, 2.15s/it]
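The flat average leaves visible seams wherever the frames' exposures differ. A common refinement, not used above, is feathering: weight each frame's contribution by each pixel's distance from that frame's border, so overlaps fade into each other. A sketch using cv.distanceTransform:
def blend_feathered(image_list: list[np.ndarray]) -> np.ndarray:
    # Each pixel's weight is its distance to the nearest empty pixel in its
    # own frame, so a frame's influence fades out toward its warped edges
    weights = []
    for image in image_list:
        mask = (cv.cvtColor(image, cv.COLOR_BGR2GRAY) > 0).astype(np.uint8)
        weights.append(cv.distanceTransform(mask, cv.DIST_L2, 3).astype(np.float32))
    total = np.sum(weights, axis=0)
    total[total == 0] = 1  # Avoid zero divide where no frame has data
    blended = np.zeros_like(image_list[0], dtype=np.float32)
    for weight, image in zip(weights, image_list):
        blended += (weight / total)[..., None] * image.astype(np.float32)
    return np.clip(blended, 0, 255).astype(np.uint8)
feathered_panorama = blend_feathered(centered_frames)
show_image(feathered_panorama)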

Another simple approach is to just keep one frame's pixel wherever there is an overlap
def blend_first_pixel(image_list: list[np.ndarray]) -> np.ndarray:
    blended_image = np.zeros_like(image_list[0])
    mask = np.zeros(image_list[0].shape[:2], dtype=bool)
    for img in image_list:
        # A pixel counts as filled if any channel is non-zero
        img_mask = np.any(img != 0, axis=2)
        # Only copy pixels that no earlier frame has already filled
        update_mask = img_mask & (~mask)
        for c in range(3):  # for each channel
            blended_image[..., c][update_mask] = img[..., c][update_mask]
        mask |= update_mask  # update mask to mark filled pixels
    return blended_image
selective_panorama = blend_first_pixel(centered_frames)
show_image(selective_panorama)

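And if you want the results on disk rather than just inline, cv.imwrite will save them (the filenames here are arbitrary placeholders):
# Save both panoramas; filenames are placeholders
cv.imwrite("panorama_blended.png", blended_panorama)
cv.imwrite("panorama_first_pixel.png", selective_panorama)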