Detecting faces in a Video
Detecting faces in a video is not much different from analyzing a single image for faces. The difference lies in extracting each image frame from the video and processing it for faces, rather than processing one standalone image. However, many video formats compress frames by encoding only the changes relative to earlier frames, so in such formats extraction has to proceed sequentially from the start of the video (or at least from the nearest keyframe): each frame's data depends on the frames before it in the timeline.
#!/usr/bin/env python3
"""
Detect faces in a given video and create new video with faces marked.
"""
import cv2
import sys
def process_video(video_name, capture_frequency):
    vidcap = cv2.VideoCapture(video_name)
    success, current_image_frame = vidcap.read()
    frame_num = 0
    print("Extracting once every {0} frames ...".format(capture_frequency))
    # Create classifier based on haarcascade (the XML file ships with OpenCV;
    # here it is expected in the current directory)
    face_classifier = cv2.CascadeClassifier("haarcascade_frontalface_default.xml")
    while success:
        if frame_num % capture_frequency == 0:
            #print("Processing frame #:", frame_num)
            image = cv2.resize(current_image_frame, (0, 0), fx=0.8, fy=0.8)
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            # Detect faces in image
            faces = face_classifier.detectMultiScale(
                gray,
                scaleFactor=1.1,
                minNeighbors=5,
                minSize=(80, 80)
            )
            #print("Found {0} faces".format(len(faces)))
            # Draw rectangles around the faces
            for (x, y, w, h) in faces:
                cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
            cv2.imshow("Faces found", image)
            #status = cv2.imwrite("frame_{0}_detected.png".format(frame_num), image)
            #print("Image written to file-system : ", status)
            cv2.waitKey(1)  # show frame briefly; use waitKey(0) to pause on each frame instead
        success, current_image_frame = vidcap.read()
        #print('Read a new frame: ', success)
        frame_num += 1
    vidcap.release()
    cv2.destroyAllWindows()
    #print("Generating video ...")
    #generate_video()
def main():
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("-v", "--vid", required=True,
                    help="Path to input video")
    ap.add_argument("-f", "--frequency", type=int, default=15,
                    help="Process once every n frames (integer)")
    # [10, 15] enough for ~24 fps videos for near-realtime
    args = vars(ap.parse_args())
    process_video(args["vid"], args["frequency"])

if __name__ == "__main__":
    main()
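Assuming the listing is saved as detect_faces_video.py (the filename here is only illustrative), it can be run from the command line like so:

    python3 detect_faces_video.py --vid input.mp4 --frequency 15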
Several parameters above could be adjusted to better suit the scenario. To save each processed frame (with rectangles drawn around detected faces) in the current directory (check it with pwd), you can uncomment the line beginning with status = cv2.imwrite. It almost goes without saying that there is still much room for improvement here.
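As an example of such tuning, the detector could be called with gentler settings to pick up smaller faces, at the cost of more false positives and more compute (the values below are illustrative, not tuned for any particular video):

    faces = face_classifier.detectMultiScale(
        gray,
        scaleFactor=1.05,   # finer scale steps catch more face sizes, but run slower
        minNeighbors=3,     # fewer confirmations required: more detections, more false positives
        minSize=(40, 40)    # allow smaller faces than the original (80, 80)
    )

The commented-out generate_video() call, together with the docstring, hints at stitching the processed frames back into a new video. A minimal sketch of such a function using OpenCV's VideoWriter could look like the following (the generate_video signature and the idea of collecting annotated frames into a list are assumptions, not part of the original listing):

    import cv2

    def generate_video(frames, output_name="faces_detected.mp4", fps=24):
        """Write a list of BGR frames (numpy arrays) to a new video file."""
        # Assumes all frames share the dimensions of the first one
        height, width = frames[0].shape[:2]
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        writer = cv2.VideoWriter(output_name, fourcc, fps, (width, height))
        for frame in frames:
            writer.write(frame)
        writer.release()

process_video() would then append each annotated image to a list inside the loop and call generate_video() once reading finishes.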