import os
import subprocess

from pydub import AudioSegment
from pydub.utils import mediainfo
from moviepy.editor import VideoFileClip

import librosa

import numpy as np

from scipy import signal
from scipy.signal import find_peaks
from scipy.interpolate import interp1d

import matplotlib.pyplot as plt


def loadAudio(file_path, return_segment=False):
    audio_segment = AudioSegment.from_file(file_path)
    data = np.array(audio_segment.get_array_of_samples())
    if audio_segment.channels > 1:
        data = data.reshape((-1, audio_segment.channels)).mean(axis=1)  # Convert to mono if stereo
    rate = audio_segment.frame_rate

    if return_segment:
        return data, rate, audio_segment
    else:
        return data, rate
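
# Example usage (a sketch; "take01.mp3" is a hypothetical path):
#   data, rate = loadAudio("take01.mp3")                       # mono sample array + rate in Hz
#   data, rate, seg = loadAudio("take01.mp3", return_segment=True)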


def findOffset2(audio1, audio2):
    # Perform the cross-correlation between the two audio signals
    correlation = signal.correlate(audio2, audio1, mode='full')
    
    # Find the index of the maximum correlation value
    max_index = np.argmax(correlation)

    # For mode='full' the zero-lag index is len(audio1) - 1; subtracting it
    # converts the peak index into a lag in samples (positive lag means
    # audio1 must be delayed to line up with audio2)
    lag = max_index - (len(audio1) - 1)

    return lag

def findOffset(audio1, audio2):
    correlation = signal.correlate(audio2, audio1, mode="full")
    lags = signal.correlation_lags(audio2.size, audio1.size, mode="full")
    lag = lags[np.argmax(correlation)]
    
    return lag
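
# Sanity check (sketch): a signal delayed by 1000 samples should yield
# lag == 1000 from either implementation.
#   sig = np.random.randn(44100)
#   delayed = np.concatenate([np.zeros(1000), sig])
#   assert findOffset(sig, delayed) == 1000
#   assert findOffset2(sig, delayed) == 1000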



def extractAudioFromAudioFile(inputFile, outputFile, startTimeInSeconds, endTimeInSeconds):
    audio = AudioSegment.from_file(inputFile)
    
    # Convert start and end times from seconds to milliseconds
    start_ms = int(startTimeInSeconds * 1000)
    end_ms = int(endTimeInSeconds * 1000)

    # Ensure end_ms does not exceed the actual duration of the audio
    actual_duration_ms = len(audio)
    end_ms = min(end_ms, actual_duration_ms)

    # Extract the segment from start to end time
    seg = audio[start_ms:end_ms]

    # Retrieve the bitrate information from the original file
    bitrate = mediainfo(inputFile)['bit_rate']

    # Export the extracted segment
    seg.export(outputFile, format="mp3", bitrate=bitrate)
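
# Example usage (sketch; hypothetical paths): keep the first five minutes.
#   extractAudioFromAudioFile("take01.mp3", "take01_clip.mp3", 0, 300)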



def extractAudioFromVideoFile(inputFile, outputFile, startTimeInSeconds, endTimeInSeconds):
    video = VideoFileClip(inputFile)
    
    if endTimeInSeconds > video.duration:
        endTimeInSeconds = video.duration

    clipped_audio = video.audio.subclip(startTimeInSeconds, endTimeInSeconds)
    
    clipped_audio.write_audiofile(outputFile, codec="pcm_s16le", bitrate="320k")
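
# Example usage (sketch; hypothetical paths):
#   extractAudioFromVideoFile("lesson01.mp4", "lesson01.wav", 0, 300)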



def extractAudioUsingFFmpeg(inputFile, outputFile, startTimeInSeconds, endTimeInSeconds, audio_delay_seconds=-0.002):  # Default: shift audio 2 ms earlier
    command = [
        'ffmpeg',
        '-y',  # Overwrite output files without asking
        '-v', 'error',  # Show only errors
        '-itsoffset', str(audio_delay_seconds),  # Offset for audio, adjusts sync
        '-i', inputFile,  # Input file
        '-ss', str(startTimeInSeconds),  # Start time for audio extraction
        '-to', str(endTimeInSeconds),  # End time for audio extraction
        '-vn',  # No video output
        '-acodec', 'pcm_s16le',  # Set audio codec for WAV output (or use 'copy' for original codec)
        '-ar', '44100',  # Audio sample rate
        outputFile  # Output file
    ]

    # Execute the FFmpeg command
    process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    # Check for errors
    if process.returncode != 0:
        print(f"Error: {process.stderr}")
    else:
        print("Audio extraction completed successfully.")



def adjustAudio(audio_segment, lag, frame_rate):
    # Convert lag from samples to milliseconds, rounding to nearest integer at the last step
    ms_lag = round((lag / frame_rate) * 1000)
    
    if lag > 0:
        # Audio needs to start later: pad audio at the beginning
        silence = AudioSegment.silent(duration=ms_lag, frame_rate=frame_rate)
        adjusted_audio = silence + audio_segment
    else:
        # Audio needs to start earlier: trim audio from the beginning
        adjusted_audio = audio_segment[abs(ms_lag):]  # Use abs to convert negative lag to positive

    return adjusted_audio
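
# Worked example: at 44100 Hz, lag = 2205 samples -> round(2205 / 44100 * 1000)
# = 50 ms of leading silence; lag = -2205 trims the first 50 ms instead.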



def alignAudioTrack(audioFile, newAudioFile, lag):
    audio, rate, audio_segment = loadAudio(audioFile, return_segment=True)

    # Adjust the AudioSegment based on lag, ensuring frame_rate is passed correctly
    adjusted_audio = adjustAudio(audio_segment, lag, rate)

    # Save the adjusted audio in 32-bit floating-point WAV format
    adjusted_audio.export(newAudioFile, format="wav", codec="pcm_f32le")
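
# Example usage (sketch; hypothetical paths, lag in samples from findOffset2):
#   alignAudioTrack("take01.mp3", "take01_synced.wav", lag)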



def normalize_audio2(audio, amp_factor=1.0):
    audio = audio / np.max(np.abs(audio)) * amp_factor
    return audio



def downsample(data, num_samples):
    indices = np.linspace(0, len(data) - 1, num_samples, dtype=int)
    return data[indices], indices
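
# Example: reduce a long signal to 5000 evenly spaced samples.
#   small, idx = downsample(y, 5000)   # small == y[idx], len(small) == 5000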



def plotWaveforms(audioFile1, audioFile2, imageFile, startSegment=30, endSegment=40, clip_negative=True, num_samples=5000, barWidth=1):
    y1, sr1 = librosa.load(audioFile1, sr=None, offset=startSegment, duration=endSegment - startSegment)
    y2, sr2 = librosa.load(audioFile2, sr=None, offset=startSegment, duration=endSegment - startSegment)
    
    # Normalize and adjust the amplitude of the audio signals
    y1 = normalize_audio2(y1, 1)
    y2 = normalize_audio2(y2, 1)

    # Downsample data
    y1, indices1 = downsample(y1, num_samples)
    y2, indices2 = downsample(y2, num_samples)
    t1 = indices1 / sr1  # Map the kept samples back to their original times
    t2 = indices2 / sr2

    # Apply clip_negative if True
    if clip_negative:
        y1 = np.clip(y1, 0, None)
        y2 = np.clip(y2, 0, None)

    # Create a figure with a black background
    fig, ax = plt.subplots(figsize=(14, 1), facecolor='black')

    # Set bar width based on the barWidth multiplier
    bar_width = (t1[1] - t1[0]) * barWidth

    # Plot amplitudes as bars centered on zero
    ax.bar(t1, y1, width=bar_width, color=(0.55, 0, 0.8), alpha=1, bottom=-y1/2)
    ax.bar(t2, y2, width=bar_width, color=(1, 0.6, 0), alpha=1, bottom=-y2/2)

    # Fit the axes tightly; bars span [-y/2, +y/2], so the y-limits are symmetric about zero
    ax.set_xlim(t1[0] - bar_width/2, t1[-1] + bar_width/2)
    max_abs_val = max(np.max(np.abs(y1)), np.max(np.abs(y2))) / 2
    ax.set_ylim(-max_abs_val, max_abs_val)

    # Add a center line
    ax.axhline(y=0, color=(1, 0.6, 0), linewidth=0.5)

    # Remove axes, labels, and title for a clean look
    ax.axis('off')

    # Adjust the subplot padding
    fig.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05)

    # Save the figure with a specific resolution
    plt.savefig(imageFile, format='png', dpi=300, bbox_inches='tight', pad_inches=0.05)
    plt.close()
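
# Example usage (sketch; hypothetical paths): overlay 10 s of two takes.
#   plotWaveforms("take01_video.wav", "take01_synced.wav", "take01.png")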



def normalize_audio(audio):
    audio = audio / np.max(np.abs(audio))
    audio[audio < 0] = 0  # Clamp negative values to zero
    return audio

def sample_peaks(y, num_points=100):
    peaks, _ = find_peaks(y)
    if len(peaks) > num_points:
        selected_peaks = np.linspace(0, len(peaks) - 1, num_points, dtype=int)
        peaks = peaks[selected_peaks]
    return peaks
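
# Example: thin a dense waveform to about 100 peak positions for plotting.
#   peaks = sample_peaks(y, num_points=100)   # indices into y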

def plotWaveforms2(audioFile1, audioFile2, imageFile, startSegment=30, endSegment=31, num_points=441, line_thickness=1):
    y1, sr1 = librosa.load(audioFile1, sr=None, offset=startSegment, duration=endSegment - startSegment)
    y2, sr2 = librosa.load(audioFile2, sr=None, offset=startSegment, duration=endSegment - startSegment)
    y1, y2 = normalize_audio(y1), normalize_audio(y2)

    # print(len(y1), len(y2))  # debug
    peaks1 = sample_peaks(y1, num_points)
    peaks2 = sample_peaks(y2, num_points)

    t1 = np.linspace(0, len(y1) / sr1, len(y1))[peaks1]
    t2 = np.linspace(0, len(y2) / sr2, len(y2))[peaks2]

    f1 = interp1d(t1, y1[peaks1], kind='cubic', fill_value="extrapolate", bounds_error=False)
    f2 = interp1d(t2, y2[peaks2], kind='cubic', fill_value="extrapolate", bounds_error=False)

    fine_t1 = np.linspace(t1.min(), t1.max(), 500)
    fine_t2 = np.linspace(t2.min(), t2.max(), 500)

    plt.figure(figsize=(16, 1), facecolor='black')
    plt.plot(fine_t1, f1(fine_t1), color='green', alpha=1, linewidth=line_thickness)
    plt.plot(fine_t2, f2(fine_t2), color='orange', alpha=1, linewidth=line_thickness)

    plt.fill_between(fine_t1, 0, f1(fine_t1), color='green', alpha=0.5)
    plt.fill_between(fine_t2, 0, f2(fine_t2), color='orange', alpha=0.5)

    plt.axis('off')
    plt.savefig(imageFile, format='png', dpi=300, bbox_inches='tight', pad_inches=0.1)
    plt.close()
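
# Example usage (sketch; hypothetical paths): compare one second of two takes
# as smoothed peak envelopes.
#   plotWaveforms2("take01_video.wav", "take01_synced.wav", "take01_env.png")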




duration = 300  # seconds
limit = None

# Process all audio files in a directory
audioDir = "N:/RDX/Gumroad/Tutorials/VEX Volume 2/Audio/Poly Carve/1 original"
wavDir = "N:/RDX/Gumroad/Tutorials/VEX Volume 2/Audio/Poly Carve/_audio from video"
videoDir = "N:/RDX/Gumroad/Tutorials/VEX Volume 2/Chapters/Poly Carve/Video"
newAudioDir = "N:/RDX/Gumroad/Tutorials/VEX Volume 2/Audio/Poly Carve/2 synced"
imgDir = "N:/RDX/Gumroad/Tutorials/VEX Volume 2/Audio/Poly Carve/_img"
tempDir = os.path.join(os.path.dirname(audioDir), "_tracks")



processed_count = 0  # Counter for the number of files processed


# Create directories if not present
os.makedirs(newAudioDir, exist_ok=True)
#os.makedirs(imgDir, exist_ok=True)
#os.makedirs(tempDir, exist_ok=True)



for filename in os.listdir(videoDir):
    if filename.endswith(".trec"):
        if limit is not None and processed_count >= limit:
            print(f"Limit reached: processed {processed_count} files.")
            break

        baseName, _ = os.path.splitext(filename)
        origAudioFile = os.path.join(audioDir, f"{baseName}.mp3")
        refAudioFile = os.path.join(wavDir, f"{baseName}.wav")
        alignedAudioFile = os.path.join(newAudioDir, f"{baseName}.wav")
        
        if not os.path.exists(origAudioFile):
            print(f"Matching MP3 not found: {filename}.")
            continue


        audioFile1 = os.path.join(tempDir, f"{baseName}_audio.mp3")
        audioFile2 = os.path.join(tempDir, f"{baseName}_video.wav")
        if not os.path.exists(alignedAudioFile):
            # Optional: extract the two audio tracks first (disabled; the
            # pre-extracted files in audioDir and wavDir are used instead)
            # if not os.path.exists(audioFile1):
            #     extractAudioFromAudioFile(origAudioFile, audioFile1, 0, 300)

            # if not os.path.exists(audioFile2):
            #     trecFile = os.path.join(videoDir, filename)
            #     extractAudioFromVideoFile(trecFile, audioFile2, 0, 300)

            #     # Alternative: rename the .trec file to .mp4 temporarily so
            #     # FFmpeg can read it, then rename it back
            #     base_dir = os.path.dirname(trecFile)
            #     temp_filename = os.path.join(base_dir, "temp_extract_audio.mp4")
            #     os.rename(trecFile, temp_filename)
            #     extractAudioUsingFFmpeg(temp_filename, audioFile2, 0, 300)
            #     os.rename(temp_filename, trecFile)


            # Compute offset
            audio1, rate1 = loadAudio(origAudioFile)
            audio2, rate2 = loadAudio(refAudioFile)

            # moviepy introduces a small but fixed delay, so it could be hardcoded:
            # delay = 1055
            lag = findOffset2(audio1, audio2)
            print(f"audio: {baseName} lag: {lag}")

            # Align the original audio track using the lag value (in samples)
            alignAudioTrack(origAudioFile, alignedAudioFile, lag)
        # else:
        #     print(f"Skipping {filename}, aligned file already exists.")
        

        # Plot waveforms
        #plotImageFile = os.path.join(imgDir, f"{baseName}.png")
        #plotWaveforms(audioFile2, alignedAudioFile, plotImageFile)



        processed_count += 1



# verifyAudioAlignment(newAudioDir, audioDir)
# audio1, rate1 = loadAudio("N:/RDX/Gumroad/Tutorials/VEX Volume 2/Audio/Curvature/_tracks/_manual.wav")
# audio2, rate2 = loadAudio("N:/RDX/Gumroad/Tutorials/VEX Volume 2/Audio/Curvature/_tracks/curvature 01_video.wav")
# lag = findOffset(audio1, audio2)
# print(lag)