# BirdVisualizer/app.py

from flask import Flask, request, jsonify, render_template
import librosa
import numpy as np
import tempfile
import os

# The single Flask application object; the route decorators in this module
# register their views on it.
app = Flask(__name__)
# 100 MB limit, stored under Flask's MAX_CONTENT_LENGTH config key.
app.config.update(MAX_CONTENT_LENGTH=100 * 1024 * 1024)

# Lower-case file extensions (with leading dot) that the upload endpoint
# accepts; anything else is rejected before the file is written to disk.
ALLOWED_EXTENSIONS = {
    '.flac',
    '.m4a',
    '.mp3',
    '.ogg',
    '.wav',
}


@app.route('/')
def index():
    """Serve the landing page by rendering the ``index.html`` template."""
    page = render_template('index.html')
    return page


@app.route('/analyze', methods=['POST'])
def analyze():
    """Accept an uploaded audio file and return its analysis as JSON.

    The upload is read from the multipart form field ``file``. It is written
    to a temporary file (keeping the original extension as the suffix),
    passed to ``analyze_audio`` and the temporary file is removed afterwards
    whether or not the analysis succeeded.

    Returns:
        * 200 with the dict produced by ``analyze_audio`` on success.
        * 400 with ``{'error': ...}`` when the ``file`` field is missing,
          no file was selected, or the extension is not in
          ``ALLOWED_EXTENSIONS``.
        * 500 with ``{'error': 'Analysis failed: ...'}`` when saving or
          analysing the file raises.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']
    if not file or not file.filename:
        return jsonify({'error': 'No file selected'}), 400

    ext = os.path.splitext(file.filename)[1].lower()
    if ext not in ALLOWED_EXTENSIONS:
        # Sort so the message is stable; set iteration order is arbitrary.
        allowed = ", ".join(sorted(ALLOWED_EXTENSIONS))
        return jsonify({'error': f'Unsupported file type. Allowed: {allowed}'}), 400

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            # Record the path before writing: the file already exists on disk
            # (delete=False), so it must be cleaned up even if save() raises.
            tmp_path = tmp.name
            # Write through the already-open handle instead of re-opening the
            # path by name, which is not possible on Windows while it is open.
            file.save(tmp)

        result = analyze_audio(tmp_path)
        return jsonify(result)
    except Exception as e:
        # Top-level boundary for this request: keep the traceback in the
        # server log, and report the failure to the client as JSON.
        app.logger.exception('Audio analysis failed')
        return jsonify({'error': f'Analysis failed: {e}'}), 500
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass


def analyze_audio(file_path, max_points=20000, *, sample_rate=22050,
                  n_fft=2048, hop_length=512, floor_percentile=75):
    """Convert an audio file into a sparse time/frequency/magnitude point set.

    The file is loaded as mono, its STFT magnitude is computed, and only the
    loudest time-frequency bins are kept: first everything at or above the
    ``floor_percentile`` percentile of all magnitudes, then (if that is still
    more than ``max_points``) the ``max_points`` largest of those.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        max_points: Upper bound on the number of points returned. Must be
            at least 1.
        sample_rate: Sample rate passed to ``librosa.load`` as ``sr``.
        n_fft: FFT window size for the STFT.
        hop_length: Hop between successive STFT frames, in samples.
        floor_percentile: Percentile (0-100) of the magnitude distribution
            used as the noise floor; bins below it are discarded. The
            default of 75 keeps the loudest 25%.

    Returns:
        A JSON-serialisable dict with parallel lists, one entry per point:
        ``t`` (frame time in seconds), ``f`` (bin frequency in Hz),
        ``a`` (STFT magnitude), and ``tn`` / ``fn`` / ``an``, the same
        values min-max scaled to [0, 1] over the returned points. Also
        ``duration`` (timestamp of the last STFT frame, in seconds),
        ``sampleRate`` and ``pointCount``. Point order is not sorted.

    Raises:
        ValueError: If ``max_points`` is less than 1 or the file decodes to
            zero samples.
    """
    if max_points < 1:
        # With 0, the slice [-0:] below would silently return every point.
        raise ValueError('max_points must be a positive integer')

    # Load audio as mono at the requested sample rate
    y, sr = librosa.load(file_path, mono=True, sr=sample_rate)
    if y.size == 0:
        raise ValueError('Audio file contains no samples')

    # Short-Time Fourier Transform
    S = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
    magnitude = np.abs(S)  # shape: (n_freq_bins, n_frames)

    times = librosa.frames_to_time(
        np.arange(magnitude.shape[1]), sr=sr, hop_length=hop_length
    )
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    # Work on flat (row-major) indices into the spectrogram rather than
    # materialising full time/frequency grids: only a small subset of bins
    # survives, so their coordinates are looked up after selection.
    flat_mag = magnitude.ravel()

    # Remove noise floor: keep bins at or above the chosen percentile
    threshold = np.percentile(flat_mag, floor_percentile)
    keep = np.flatnonzero(flat_mag >= threshold)
    flat_mag = flat_mag[keep]

    # If still too many, keep top N by magnitude
    if len(flat_mag) > max_points:
        top_idx = np.argpartition(flat_mag, -max_points)[-max_points:]
        keep = keep[top_idx]
        flat_mag = flat_mag[top_idx]

    # Map the surviving flat indices back to (frequency bin, frame) pairs.
    freq_idx, time_idx = np.unravel_index(keep, magnitude.shape)
    flat_f = freqs[freq_idx]
    flat_t = times[time_idx]

    # Normalize each dimension to [0, 1]
    def norm(arr):
        lo, hi = arr.min(), arr.max()
        # The epsilon avoids division by zero when all values are equal.
        return (arr - lo) / (hi - lo + 1e-10)

    t_norm = norm(flat_t)
    f_norm = norm(flat_f)
    a_norm = norm(flat_mag)

    return {
        # Raw values (for display / audio sync)
        't': flat_t.round(4).tolist(),
        'f': flat_f.round(2).tolist(),
        'a': flat_mag.round(6).tolist(),
        # Normalized values [0,1] used for 3D coordinates
        'tn': t_norm.round(6).tolist(),
        'fn': f_norm.round(6).tolist(),
        'an': a_norm.round(6).tolist(),
        # Metadata
        'duration': round(float(times[-1]), 3),
        'sampleRate': int(sr),
        'pointCount': len(flat_t),
    }


if __name__ == '__main__':
    # NOTE: MP3 support requires ffmpeg to be installed and on PATH.
    # WAV files work out of the box.
    # NOTE(review): debug=True enables Flask's debug mode; this entry point
    # looks intended for local development only - confirm before deploying.
    run_options = {'debug': True, 'port': 5000}
    app.run(**run_options)