sound-art/main.py

import librosa
import numpy as np
import matplotlib.pyplot as plt
import scipy.io.wavfile as wavfile
import plotly.express as px


def load_audio(filename: str, mono: bool = False) -> (np.ndarray, int):
    audio, sr = librosa.load(filename, sr=None, mono=mono)
    # sr=None ensures original sampling rate is used
    return audio, sr


def get_fourier_transform(audio: np.ndarray, sr: int, n: int = None) -> np.ndarray:
    # Compute the Fourier Transform
    fft_data = np.fft.fft(audio, n=n)

    # Compute the magnitude (amplitude) of the complex numbers
    fft_magnitude = np.abs(fft_data)

    # Create frequency bins
    freq = np.fft.fftfreq(len(fft_data), d=1 / sr)

    return freq, fft_magnitude


def plot_audio(audio: np.ndarray, sr: int, secs: int = 10) -> None:
    if audio.ndim == 1:
        audio = np.array([audio, audio])
    sample = audio[:, : (sr * secs)]
    fig, ax = plt.subplots()
    ax.plot(sample[0], label="L", c="xkcd:azure", alpha=0.5)
    ax.plot(sample[1], label="R", c="xkcd:orange", alpha=0.5)
    plt.savefig(f"out/{secs:03d}.png")


def plot_fourier(audio, secs, sr):
    print("fourier")
    freq, fft_magnitude = get_fourier_transform(audio[0, : (secs * sr)], sr)
    plt.figure(figsize=(10, 6))
    plt.bar(freq, fft_magnitude, width=1.0)  # Using bar plot to mimic histogram
    plt.xlabel("Frequency (Hz)")
    plt.ylabel("Magnitude")
    plt.xscale("log")  # Logarithmic scale for frequency axis
    plt.title("Frequency Spectrum")
    plt.savefig(f"out/fourier-{secs:03d}.png")


def get_spectra(audio, secs, sr):
    print("spectra")
    # NFFT: Number of data points used in each block for the FFT. A larger value provides better frequency resolution but worse time resolution. 2048 is a common choice.
    # noverlap: The number of points of overlap between blocks. 1024 is half of NFFT, which is a common choice.
    # Plotting the Spectrogram using plt.specgram
    plt.figure(figsize=(10, 6))
    Pxx, freqs, bins, im = plt.specgram(
        audio[1, : (secs * sr)],
        Fs=sr,
        NFFT=2048,
        noverlap=1024,
        cmap="viridis",
        scale="dB",
    )
    plt.xlabel("Time (s)")
    plt.ylabel("Frequency (Hz)")
    plt.colorbar(im).set_label("Intensity (dB)")
    plt.title("Spectrogram")
    plt.savefig(f"out/spectra-{secs:03d}.png")

    return Pxx, freqs, bins, im


def reconstruct_signal(
    freq, mag, sr: int = 44100, secs: int = 1, normalize: bool = True
):
    # Sample rate and duration of the original audio
    duration = secs  # seconds

    # Create an array for time
    t = np.linspace(0, duration, duration * sr, endpoint=False)

    # Initialize the reconstructed signal as zeros
    reconstructed_signal = np.zeros(len(t))

    # Add each dominant frequency as a sine wave
    for freq, magnitude in zip(freq, mag):
        # Generate a sine wave for the dominant frequency
        sine_wave = magnitude * np.sin(2 * np.pi * freq * t)

        # Add the sine wave to the reconstructed signal
        reconstructed_signal += sine_wave

    # Normalize the reconstructed signal (optional)
    if normalize:
        reconstructed_signal /= np.max(np.abs(reconstructed_signal))

    return t, reconstructed_signal


def save_sound(wave: np.ndarray, sr: int = 44100, filename: str = "out.mp3"):
    wavfile.write(filename, sr, wave.astype(np.float32))


def get_dominant(freq: np.ndarray, fft_magnitude: np.ndarray, N: int = 5):
    # sr / 2 is the number of frequencies we can resolve (Nyquist Theorem)
    N = min(N, len(freq))
    # these frequencies we can resolve are the positive ones.
    # negative ones contain phase information (ignored for now)
    pos_fft_magnitude = fft_magnitude[freq > 0]
    pos_freq = freq[freq > 0]
    top_indices = np.argsort(pos_fft_magnitude)[::-1][:N]
    dominant_frequencies = pos_freq[top_indices]
    dominant_magnitudes = pos_fft_magnitude[top_indices]

    return dominant_frequencies, dominant_magnitudes, top_indices


def custom_specgram(audio, sr, NFFT: int = 2048, noverlap: int = 1024):
    step = NFFT - noverlap
    segments = int((len(audio) - noverlap) / step)

    specgram = np.zeros((1 + NFFT // 2, segments))

    for i in range(segments):
        # Extract the segment
        start = i * step
        end = start + NFFT
        segment = audio[start:end]

        # Apply FFT
        fft_segment = np.fft.fft(segment, n=NFFT)

        # Extract positive frequencies
        pos_fft = fft_segment[: NFFT // 2 + 1]

        # Store the magnitudes
        specgram[:, i] = np.abs(pos_fft)

    fft_freq = np.fft.fftfreq(NFFT, d=1 / sr)
    fft_freq = fft_freq[: NFFT // 2 + 1]
    return specgram, fft_freq


if __name__ == "__main__":
    filename = "clip.mp3"
    audio, sr = load_audio(filename)
    print(audio)
    print(sr)
    # secs = len(audio[0]) // sr
    secs = 10  # number of seconds
    print(f"Analyzing {filename} ({secs} seconds)")

    plot_audio(audio, sr, secs=secs)
    # plot_fourier(audio, secs, sr)
    vol, freqs, time, im = get_spectra(audio, secs, sr)

    # take the logarithm to map data to exponents. new range approx (-50, 0)
    logvol = np.log(vol)
    # rescale it to 0 - 1 (relative volume)
    logvol_scaled = (logvol - logvol.min()) / (logvol.max() - logvol.min())
    print(audio.shape, vol.shape, freqs.shape, time.shape)
    print(time)
    print(freqs)
    full_vol = logvol_scaled.sum(axis=1)

    # Analysis of frequencies in section of song (as a whole, not over-time)
    freq, fft_magnitude = get_fourier_transform(audio[0, : (secs * sr)], sr)

    N = 10
    dominant_frequencies, dominant_magnitudes, _ = get_dominant(
        freq, fft_magnitude, N=N
    )

    # Reconstruct signal from dominant frequencies
    t, new_sig = reconstruct_signal(
        dominant_frequencies, dominant_magnitudes, sr=sr, secs=secs
    )
    save_sound(new_sig, sr=sr, filename="reconstructed_audio.mp3")

    # Image
    fig, ax = plt.subplots()
    ax.set_title("Reconstructed signal")
    ax.plot(t, new_sig)
    fig.savefig(f"out/reconstructed_signal_{secs}s.png")
    fig = px.line(x=t, y=new_sig, title="Reconstructed signal")
    fig.write_image(f"out/reconstructed_signal_{secs}s.png")
    fig.write_html(f"out/index.html")

    print(f"Using top {N} frequencies ({100*N/sr/2:2.2f}%)")
    # Print the dominant frequencies and their magnitudes
    for i in range(N):
        print(
            f"Dominant Frequency {i + 1}: {dominant_frequencies[i]} Hz, Magnitude: {dominant_magnitudes[i]}"
        )

    fig, ax = plt.subplots()
    # ax.bar(pos_freq, pos_fft_magnitude)
    ax.bar(
        dominant_frequencies,
        dominant_magnitudes / dominant_magnitudes.max(),
        color="red",
    )
    ax.set_title(f"Dominant Frequencies from first {secs}s")
    fig.savefig("out/dominant.png")

    secs = 10
    sp, f = custom_specgram(audio[0, : (secs * sr)], sr)
    fig, ax = plt.subplots()
    sp = np.log(sp)
    ax.imshow(sp, aspect="auto", origin="lower")
    fig.savefig(f"out/spec-{secs}.png")

    N = 10
    # Now getting it over each time slice
    fig, ax = plt.subplots()
    for i in range(sp.shape[1]):
        # ax.bar(pos_freq, pos_fft_magnitude)
        dominant_frequencies, dominant_magnitudes, _ = get_dominant(f, sp[:, i], N=N)
        _ = ax.bar(
            dominant_frequencies, dominant_magnitudes / dominant_magnitudes.max()
        )
    ax.set_title(f"Dominant Frequencies over time")
    fig.savefig("out/time_dominant.png")

    # Let's make an animation. Each frame is a time slice
    # get viridis colors
    import matplotlib.cm as cm

    cmap = cm.get_cmap("viridis")
    # assign each frequency a color (f)
    top_freq, _, top_indices = get_dominant(f, sp.sum(axis=1), N=50)
    sort_idx = np.argsort(top_freq)
    # frequencies correspond to colors.
    colors = cmap(top_freq[sort_idx] / top_freq.max())

    def make_frame(f, sp, colors, i, N, top_indices, sort_idx):
        fig, ax = plt.subplots(subplot_kw=dict(projection="polar"))
        __, _, indices = get_dominant(f, sp[:, i], N=N)
        # draw a circle, with each slice representing a frequency.
        # each slice is colored according to the frequency's color
        # only the dominant frequencies are shown in each frame.
        # each wedge is a frequency, and its radius is the relative magnitude

        # get the colors of the dominant frequencies
        C = len(colors)
        # colors = colors[indices]
        rel_mag = np.zeros(C) + 0.5
        # remove elements of `indices` that are not in `top_indices`
        idx = indices[np.isin(indices, top_indices)]
        # print(idx, indices, top_indices, sort_idx)
        if len(idx) > 0:
            # find the location of idx in top_indices
            idx = np.where(top_indices == idx[:, None])[1]
            rel_mag[idx] = sp[idx, i] / sp[idx, i].max() + 1
            # rel_mag[~idx] = sp[~idx, i] / sp[~idx, i].max()
        # draw the wedges. each has equal angle, and the radius is the relative magnitude
        ax.bar(
            np.arange(C) * 2 * np.pi / C,
            rel_mag[sort_idx],
            width=2 * np.pi / C,
            color=colors,
            alpha=1,
        )
        ax.set_title(f"{i:06d}")
        ax.axis("off")
        ax.set_ylim(0, 2)
        out_path = f"out/frames/frame-{i:06d}.png"
        fig.savefig(out_path)
        return out_path

    make_frame(f, sp, colors, 426, N, top_indices, sort_idx)

    # slow...
    # frames = list(map(lambda i: make_frame(f, sp, colors, i, N, top_indices), range(sp.shape[1])))

    # faster
    import multiprocessing as mp

    pool = mp.Pool(mp.cpu_count())
    frames = pool.starmap(
        make_frame,
        [(f, sp, colors, i, N, top_indices, sort_idx) for i in range(sp.shape[1])],
    )
    pool.close()
    import os

    os.system(
        "ffmpeg -r 10 -f image2 -s 1920x1080 -i out/frames/frame-%06d.png -vcodec libx264 -crf 25  -pix_fmt yuv420p test.mp4"
    )