import librosa
import numpy as np
import matplotlib.pyplot as plt
import scipy.io.wavfile as wavfile
import plotly.express as px


def load_audio(filename: str, mono: bool = False) -> tuple[np.ndarray, int]:
    # sr=None ensures the original sampling rate is used
    audio, sr = librosa.load(filename, sr=None, mono=mono)
    return audio, sr


def get_fourier_transform(
    audio: np.ndarray, sr: int, n: int = None
) -> tuple[np.ndarray, np.ndarray]:
    # Compute the Fourier Transform
    fft_data = np.fft.fft(audio, n=n)
    # Compute the magnitude (amplitude) of the complex numbers
    fft_magnitude = np.abs(fft_data)
    # Create frequency bins
    freq = np.fft.fftfreq(len(fft_data), d=1 / sr)
    return freq, fft_magnitude


def plot_audio(audio: np.ndarray, sr: int, secs: int = 10) -> None:
    # Duplicate a mono signal so the stereo plotting below works unchanged
    if audio.ndim == 1:
        audio = np.array([audio, audio])
    sample = audio[:, : (sr * secs)]
    fig, ax = plt.subplots()
    ax.plot(sample[0], label="L", c="xkcd:azure", alpha=0.5)
    ax.plot(sample[1], label="R", c="xkcd:orange", alpha=0.5)
    ax.legend()
    plt.savefig(f"out/{secs:03d}.png")


def plot_fourier(audio, secs, sr):
    print("fourier")
    freq, fft_magnitude = get_fourier_transform(audio[0, : (secs * sr)], sr)
    plt.figure(figsize=(10, 6))
    plt.bar(freq, fft_magnitude, width=1.0)  # Using a bar plot to mimic a histogram
    plt.xlabel("Frequency (Hz)")
    plt.ylabel("Magnitude")
    plt.xscale("log")  # Logarithmic scale for the frequency axis
    plt.title("Frequency Spectrum")
    plt.savefig(f"out/fourier-{secs:03d}.png")


def get_spectra(audio, secs, sr):
    print("spectra")
    # NFFT: number of data points used in each FFT block. A larger value gives better
    # frequency resolution but worse time resolution. 2048 is a common choice.
    # noverlap: number of points of overlap between blocks. 1024 is half of NFFT,
    # which is a common choice.
    # Plot the spectrogram using plt.specgram
    plt.figure(figsize=(10, 6))
    Pxx, freqs, bins, im = plt.specgram(
        audio[1, : (secs * sr)],
        Fs=sr,
        NFFT=2048,
        noverlap=1024,
        cmap="viridis",
        scale="dB",
    )
    plt.xlabel("Time (s)")
    plt.ylabel("Frequency (Hz)")
    plt.colorbar(im).set_label("Intensity (dB)")
    plt.title("Spectrogram")
    plt.savefig(f"out/spectra-{secs:03d}.png")
    return Pxx, freqs, bins, im


def reconstruct_signal(
    freqs, mags, sr: int = 44100, secs: int = 1, normalize: bool = True
):
    # Duration of the reconstructed audio in seconds
    duration = secs
    # Time axis
    t = np.linspace(0, duration, duration * sr, endpoint=False)
    # Initialize the reconstructed signal as zeros
    reconstructed_signal = np.zeros(len(t))
    # Add each dominant frequency as a sine wave
    for freq, magnitude in zip(freqs, mags):
        # Generate a sine wave for the dominant frequency
        sine_wave = magnitude * np.sin(2 * np.pi * freq * t)
        # Add the sine wave to the reconstructed signal
        reconstructed_signal += sine_wave
    # Normalize the reconstructed signal (optional)
    if normalize:
        reconstructed_signal /= np.max(np.abs(reconstructed_signal))
    return t, reconstructed_signal


def save_sound(wave: np.ndarray, sr: int = 44100, filename: str = "out.wav"):
    # scipy's wavfile always writes WAV data, so use a .wav extension
    wavfile.write(filename, sr, wave.astype(np.float32))
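

# A minimal sketch (not used elsewhere in this script) of an alternative
# reconstruction: instead of summing phase-less sine waves as reconstruct_signal
# does, keep the complex values of the strongest FFT bins and invert them, so the
# original phase is preserved. The name `reconstruct_from_fft` and the `n_keep`
# parameter are illustrative choices, not part of the original pipeline.
def reconstruct_from_fft(audio: np.ndarray, n_keep: int = 10) -> np.ndarray:
    fft_data = np.fft.fft(audio)
    # Indices of the strongest bins by magnitude (conjugate pairs tend to appear together)
    keep = np.argsort(np.abs(fft_data))[::-1][:n_keep]
    filtered = np.zeros_like(fft_data)
    filtered[keep] = fft_data[keep]
    # Taking the real part discards any small imaginary residue left by the inversion
    return np.real(np.fft.ifft(filtered))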


def get_dominant(freq: np.ndarray, fft_magnitude: np.ndarray, N: int = 5):
    # sr / 2 is the highest frequency we can resolve (Nyquist theorem)
    N = min(N, len(freq))
    # The frequencies we can resolve are the positive ones; the negative ones
    # mirror them for real signals (phase information is ignored for now).
    pos_fft_magnitude = fft_magnitude[freq > 0]
    pos_freq = freq[freq > 0]
    top_indices = np.argsort(pos_fft_magnitude)[::-1][:N]
    dominant_frequencies = pos_freq[top_indices]
    dominant_magnitudes = pos_fft_magnitude[top_indices]
    return dominant_frequencies, dominant_magnitudes, top_indices


def custom_specgram(audio, sr, NFFT: int = 2048, noverlap: int = 1024):
    step = NFFT - noverlap
    segments = int((len(audio) - noverlap) / step)
    specgram = np.zeros((1 + NFFT // 2, segments))
    for i in range(segments):
        # Extract the segment
        start = i * step
        end = start + NFFT
        segment = audio[start:end]
        # Apply the FFT
        fft_segment = np.fft.fft(segment, n=NFFT)
        # Keep the non-negative frequencies
        pos_fft = fft_segment[: NFFT // 2 + 1]
        # Store the magnitudes
        specgram[:, i] = np.abs(pos_fft)
    # rfftfreq gives the non-negative frequency bins (0 .. sr / 2),
    # matching the rows of `specgram`
    fft_freq = np.fft.rfftfreq(NFFT, d=1 / sr)
    return specgram, fft_freq


if __name__ == "__main__":
    filename = "clip.mp3"
    audio, sr = load_audio(filename)
    print(audio)
    print(sr)

    # secs = len(audio[0]) // sr
    secs = 10  # number of seconds to analyze
    print(f"Analyzing {filename} ({secs} seconds)")
    plot_audio(audio, sr, secs=secs)
    # plot_fourier(audio, secs, sr)
    vol, freqs, time, im = get_spectra(audio, secs, sr)

    # Take the logarithm to map the data to exponents; the new range is roughly (-50, 0)
    logvol = np.log(vol)
    # Rescale it to 0 - 1 (relative volume)
    logvol_scaled = (logvol - logvol.min()) / (logvol.max() - logvol.min())
    print(audio.shape, vol.shape, freqs.shape, time.shape)
    print(time)
    print(freqs)
    full_vol = logvol_scaled.sum(axis=1)

    # Analysis of the frequencies in a section of the song (as a whole, not over time)
    freq, fft_magnitude = get_fourier_transform(audio[0, : (secs * sr)], sr)
    N = 10
    dominant_frequencies, dominant_magnitudes, _ = get_dominant(
        freq, fft_magnitude, N=N
    )

    # Reconstruct a signal from the dominant frequencies
    t, new_sig = reconstruct_signal(
        dominant_frequencies, dominant_magnitudes, sr=sr, secs=secs
    )
    save_sound(new_sig, sr=sr, filename="reconstructed_audio.wav")

    # Image
    fig, ax = plt.subplots()
    ax.set_title("Reconstructed signal")
    ax.plot(t, new_sig)
    fig.savefig(f"out/reconstructed_signal_{secs}s.png")
    fig = px.line(x=t, y=new_sig, title="Reconstructed signal")
    fig.write_image(f"out/reconstructed_signal_{secs}s.png")
    fig.write_html("out/index.html")

    print(f"Using top {N} frequencies ({100 * N / (sr / 2):2.2f}%)")
    # Print the dominant frequencies and their magnitudes
    for i in range(N):
        print(
            f"Dominant Frequency {i + 1}: {dominant_frequencies[i]} Hz, "
            f"Magnitude: {dominant_magnitudes[i]}"
        )

    fig, ax = plt.subplots()
    # ax.bar(pos_freq, pos_fft_magnitude)
    ax.bar(
        dominant_frequencies,
        dominant_magnitudes / dominant_magnitudes.max(),
        color="red",
    )
    ax.set_title(f"Dominant Frequencies from first {secs}s")
    fig.savefig("out/dominant.png")

    secs = 10
    sp, f = custom_specgram(audio[0, : (secs * sr)], sr)
    fig, ax = plt.subplots()
    sp = np.log(sp)
    ax.imshow(sp, aspect="auto", origin="lower")
    fig.savefig(f"out/spec-{secs}.png")

    N = 10
    # Now find the dominant frequencies in each time slice
    fig, ax = plt.subplots()
    for i in range(sp.shape[1]):
        # ax.bar(pos_freq, pos_fft_magnitude)
        dominant_frequencies, dominant_magnitudes, _ = get_dominant(f, sp[:, i], N=N)
        _ = ax.bar(
            dominant_frequencies, dominant_magnitudes / dominant_magnitudes.max()
        )
    ax.set_title("Dominant Frequencies over time")
    fig.savefig("out/time_dominant.png")
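
    # Sanity check (a sketch, safe to remove): the hand-rolled spectrogram should
    # expose NFFT // 2 + 1 non-negative frequency bins, matching the frequency axis
    # returned by plt.specgram above, with the last bin at the Nyquist frequency sr / 2.
    assert sp.shape[0] == len(f) == len(freqs) == 2048 // 2 + 1
    print(
        f"custom_specgram: {sp.shape[1]} time slices, {len(f)} frequency bins, "
        f"top bin at {f.max():.1f} Hz (Nyquist = {sr / 2:.1f} Hz)"
    )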

    # Let's make an animation. Each frame is a time slice.
    # Get the viridis colormap
    cmap = plt.get_cmap("viridis")
    # Assign each frequency a color (f): the overall dominant frequencies each get
    # a fixed hue so they can be tracked from frame to frame.
    top_freq, _, top_indices = get_dominant(f, sp.sum(axis=1), N=50)
    sort_idx = np.argsort(top_freq)
    # Frequencies correspond to colors
    colors = cmap(top_freq[sort_idx] / top_freq.max())

    def make_frame(f, sp, colors, i, N, top_indices, sort_idx):
        fig, ax = plt.subplots(subplot_kw=dict(projection="polar"))
        __, _, indices = get_dominant(f, sp[:, i], N=N)
        # Draw a circle, with each wedge representing a frequency.
        # Each wedge is colored according to the frequency's color, and its radius
        # is the relative magnitude; only the dominant frequencies of this frame
        # are emphasized.
        C = len(colors)
        # colors = colors[indices]
        rel_mag = np.zeros(C) + 0.5
        # Remove elements of `indices` that are not in `top_indices`
        idx = indices[np.isin(indices, top_indices)]
        # print(idx, indices, top_indices, sort_idx)
        if len(idx) > 0:
            # Find the location of each element of idx in top_indices
            idx = np.where(top_indices == idx[:, None])[1]
            rel_mag[idx] = sp[idx, i] / sp[idx, i].max() + 1
            # rel_mag[~idx] = sp[~idx, i] / sp[~idx, i].max()
        # Draw the wedges: each has an equal angle, and the radius is the relative magnitude
        ax.bar(
            np.arange(C) * 2 * np.pi / C,
            rel_mag[sort_idx],
            width=2 * np.pi / C,
            color=colors,
            alpha=1,
        )
        ax.set_title(f"{i:06d}")
        ax.axis("off")
        ax.set_ylim(0, 2)
        out_path = f"out/frames/frame-{i:06d}.png"
        fig.savefig(out_path)
        plt.close(fig)
        return out_path

    make_frame(f, sp, colors, 426, N, top_indices, sort_idx)

    # Slow...
    # frames = list(map(lambda i: make_frame(f, sp, colors, i, N, top_indices, sort_idx), range(sp.shape[1])))

    # Faster: render the frames in parallel (assumes a fork-based start method, so
    # the worker processes inherit make_frame from this __main__ block)
    import multiprocessing as mp

    pool = mp.Pool(mp.cpu_count())
    frames = pool.starmap(
        make_frame,
        [(f, sp, colors, i, N, top_indices, sort_idx) for i in range(sp.shape[1])],
    )
    pool.close()

    import os

    os.system(
        "ffmpeg -r 10 -f image2 -s 1920x1080 -i out/frames/frame-%06d.png "
        "-vcodec libx264 -crf 25 -pix_fmt yuv420p test.mp4"
    )
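
    # Note (an assumption about intent, not something the original script does): each
    # frame corresponds to one spectrogram hop of NFFT - noverlap = 1024 samples, so a
    # playback rate of sr / 1024 (about 43 fps at 44.1 kHz) would keep the video in
    # sync with the analyzed audio, e.g.:
    #
    #   fps = sr / 1024
    #   os.system(
    #       f"ffmpeg -r {fps:.2f} -f image2 -i out/frames/frame-%06d.png "
    #       "-vcodec libx264 -crf 25 -pix_fmt yuv420p realtime.mp4"
    #   )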