Bash script to download and search youtube subtitles and output clickable timestamped urls

cross-posted from: https://lemm.ee/post/23155648
Here is the script.

#!/usr/bin/env bash
# Download and search youtube subs
# deps yt-dlp ,awk, perl, any one or more of either ugrep, ripgrep, grep
# usage "script youtube_url"


main() {
    url="$@"
    check_if_url
    get_video_id
    search_for_downloaded_matching_files
    set_download_boolean_flag
    download_subs
    read_and_format_transcript_file
    echo_description_file
    user_search
}


# Iterate over the array and add items to the new array if they match the regex
check_if_url() {
    local regex='^https://[^[:space:]]+$'
        if ! [[ $url =~ $regex ]]; then
            echo "Invalid input. Valid input is a url matching regex ${regex}"
            exit 1
        fi
}


get_video_id() {
    video_id=$(echo "$url" | sed -n 's/.*v=\([^&]*\).*/\1/p')
}


search_for_downloaded_matching_files() {
    # Find newest created files matching the video_id
    transcript_file="$(  /usr/bin/ls -t --time=creation "$PWD"/*${video_id}*\.vtt 2>/dev/null | head -n 1  )"
    description_file="$(  /usr/bin/ls -t --time=creation "$PWD"/*${video_id}*\.description 2>/dev/null | head -n 1  )"
}


set_download_boolean_flag() {
    if [ -n "$transcript_file" ] && [ -n "$description_file" ]; then
        download=0 # FALSE
    else
        download=1 # TRUE
    fi
}


download_subs() {
    if [ "$download" -eq 1 ]; then
        yt-dlp --restrict-filenames --write-auto-sub --skip-download "${url}"
        yt-dlp --restrict-filenames --sub-langs=eng --write-subs --skip-download "${url}"
        yt-dlp --restrict-filenames --write-description --skip-download "${url}"
        # Search files again since they were just downloaded
        search_for_downloaded_matching_files
    fi
}


read_and_format_transcript_file() {
    perl_removed_dupes="$(perl -0777 -pe 's/^\d\d.*\n.*\n.*<\/c>//gm' <"${transcript_file}")"
    local prefix="https://www.youtube.com/watch?v=${video_id}&t="
    local suffix="s"
    formated_transcript_file="$(awk -v pre="$prefix" -v suf="$suffix" '
    /^([0-9]{2}:){2}[0-9]{2}\.[0-9]{3}/ {
        split($1, a, /[:.]/);
        $1 = pre (int(a[1]*3600 + a[2]*60 + a[3]) - 3) suf;
        sub(/ --> [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}/, "");
        sub(/ align:start position:0%$/, "");
        print;
        next;
    }
    {
        sub(/ align:start position:0%$/, "");
        print;
    }
    ' <<<"${perl_removed_dupes}")"
    #CRLF for ugrep to avoid ?bug? where before lines are not all outputted
    formated_transcript_file_CRLF=$(printf '%b' "$formated_transcript_file" | sed 's/$/\r/')
}


echo_description_file() {
    cat "${description_file}"
}


user_search() {
    echo -e "\n\n"
    read -rp "Enter regex (read as raw input): " search_term

    : ${app_count:=0}

    if command -v ug >/dev/null 2>&1; then
        echo -e "\n\n\n\n"
        echo "Ugrep output"
        ug --pretty=never -B2 -A1 -i -Z+-~1 -e "${search_term}" --andnot "^https?:\/\/"  <<<"$formated_transcript_file_CRLF"
        ((app_count++))
    fi

    if command -v rg >/dev/null 2>&1; then
        echo -e "\n\n\n\n"
        echo "Ripgrep output"
        rg -iP -B2 -A7 "^(?!https?:\/\/).*\K${search_term}" <<<"$formated_transcript_file"
        ((app_count++))
    fi
    
    if [ "$app_count" -eq 0 ]; then
        echo -e "\n\n\n\n"
        echo "Grep output"
        grep -iP -B2 -A1 "${search_term}" <<<"$formated_transcript_file"
        echo -e "\n\n"
        echo "Consider installing ripgrep and ugrep for better search"
        ((app_count++))
    fi
}


main "$@"
Bash script to download and search youtube subtitles and output clickable timestamped urls

Bash

!bash@lemmy.ml

Community stats

Community moderators