Multi-Model Pipelines

A multi-model pipeline sends audio (or metadata) through a sequence of AI models, each transforming the signal in a way the previous model could not. The output of one step becomes the input of the next.

Core Pipeline Topology

[Text Prompt]
      │
      ▼
┌─────────────┐    full song    ┌──────────────────┐
│  Suno / v3  │ ─────────────▶ │  Stem Separator  │  (Demucs / Spleeter)
└─────────────┘                └──────────────────┘
                                        │
                      ┌────────────────┬┴────────────────┐
                      ▼                ▼                  ▼
                  [Vocals]       [Drums]            [Instrumental]
                      │                                   │
                      ▼                                   ▼
              ┌───────────────┐                ┌──────────────────┐
              │  Voice Clone  │                │  MusicGen Remix  │
              │ (RVC/so-vits) │                │  (melody cond.)  │
              └───────────────┘                └──────────────────┘
                      │                                   │
                      └──────────────┬────────────────────┘
                                     ▼
                            ┌─────────────────┐
                            │  DAW / FFmpeg   │  (mix & master)
                            │  Final Mixdown  │
                            └─────────────────┘

Pipeline 1: Generate → Extend → Master

Goal: Produce a full-length track (3–5 min) from a single prompt.

Models used: Sonauto v3 (generation) → Sonauto /v2/extend (extension) → FFmpeg (normalization)

import os, time, requests

# Root URL that every Sonauto endpoint path in this script is appended to.
BASE = "https://api.sonauto.ai/v1"
# Headers sent with every request. The bearer token is read from the
# SONAUTO_API_KEY environment variable; if it is unset this line raises KeyError.
H = {"Authorization": f"Bearer {os.environ['SONAUTO_API_KEY']}", "Content-Type": "application/json"}

def poll(task_id: str, interval: float = 5.0, max_wait: float = 600.0) -> dict:
    """Block until a generation task finishes and return its result payload.

    Args:
        task_id: Identifier returned by the endpoint that started the task.
        interval: Seconds to sleep between two status checks.
        max_wait: Give up after this many seconds instead of polling forever.

    Returns:
        The decoded JSON body of ``GET {BASE}/generations/{task_id}``.

    Raises:
        RuntimeError: The status endpoint reported ``FAILURE``.
        TimeoutError: The task did not finish within ``max_wait`` seconds.
        requests.RequestException: A request timed out, could not connect,
            or came back with a 4xx/5xx status.
    """
    deadline = time.monotonic() + max_wait
    while True:
        status_resp = requests.get(
            f"{BASE}/generations/status/{task_id}", headers=H, timeout=30
        )
        # An error body (e.g. 401) never equals SUCCESS or FAILURE, so without
        # this check the loop would spin forever on a bad key or unknown task.
        status_resp.raise_for_status()
        # The body is the bare status word, possibly wrapped in double quotes.
        status = status_resp.text.strip('"')
        if status == "SUCCESS":
            result_resp = requests.get(
                f"{BASE}/generations/{task_id}", headers=H, timeout=30
            )
            result_resp.raise_for_status()
            return result_resp.json()
        if status == "FAILURE":
            raise RuntimeError(f"Task {task_id} failed")
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Task {task_id} still '{status}' after {max_wait:.0f} s"
            )
        time.sleep(interval)

# Step 1 — Generate seed track with v3
seed_resp = requests.post(f"{BASE}/generations/v3", headers=H, timeout=30, json={
    "prompt": "cinematic lo-fi hip-hop with melancholic piano",
    "tags": ["lo-fi", "cinematic", "piano"],
    "instrumental": True
})
# Fail here on a 4xx/5xx response instead of with a KeyError on "task_id" below.
seed_resp.raise_for_status()
seed = seed_resp.json()
seed_result = poll(seed["task_id"])
seed_url = seed_result["song_paths"][0]   # first entry of the returned song paths
print(f"Seed: {seed_url}")

# Step 2 — Extend right by 85 s (max per call)
ext1_resp = requests.post(f"{BASE}/generations/v2/extend", headers=H, timeout=30, json={
    "audio_url": seed_url,
    "side": "right",
    "extend_duration": 85.0,
    "tags": ["lo-fi", "cinematic", "piano"],
    "prompt": "Continue the mood, add subtle string swells"
})
ext1_resp.raise_for_status()
ext1 = ext1_resp.json()
ext1_result = poll(ext1["task_id"])
extended_url = ext1_result["song_paths"][0]
print(f"Extended: {extended_url}")

# Step 3 — Normalize with FFmpeg
import subprocess
subprocess.run([
    "ffmpeg",
    "-y",                                  # overwrite final_track.mp3 on re-runs instead of stopping at a prompt
    "-i", extended_url,                    # the URL is handed to FFmpeg as-is; nothing is downloaded by this script
    "-af", "loudnorm=I=-14:LRA=7:TP=-1",   # loudnorm targets: integrated loudness, loudness range, true peak
    "-c:a", "mp3", "-b:a", "320k",
    "final_track.mp3"
], check=True)                             # check=True raises CalledProcessError if FFmpeg exits non-zero
print("Done → final_track.mp3")

Pipeline 2: Suno → Stems → MusicGen Remix

Goal: Take an AI-generated song, isolate its instrumental (non-vocal) stem, and feed that stem to MusicGen as the melody condition for a style transfer.

Models used: Suno (full song) → Demucs (stem separation) → MusicGen (melody-conditioned generation)

import torchaudio
from audiocraft.models import MusicGen
import subprocess, os

SUNO_AUDIO = "suno_output.mp3"   # download from Suno manually or via API

# Step 1 — Separate stems with Demucs
# --two-stems=vocals produces exactly two files: vocals.wav and no_vocals.wav.
subprocess.run(["python", "-m", "demucs", "--two-stems=vocals", SUNO_AUDIO], check=True)

# Demucs writes to separated/<model>/<input file name without extension>/.
# Derive the folder from SUNO_AUDIO so renaming the input does not break the path.
# NOTE(review): "htdemucs" is assumed to be the default model of the installed Demucs — confirm.
track_name = os.path.splitext(os.path.basename(SUNO_AUDIO))[0]
melody_path = f"separated/htdemucs/{track_name}/no_vocals.wav"

# Step 2 — Load melody waveform
melody_wav, sr = torchaudio.load(melody_path)
melody_wav = melody_wav.unsqueeze(0)   # add a leading batch dimension: [1, C, T]

# Step 3 — Condition MusicGen on the melody
# Use the full checkpoint id; the short alias "melody" is a deprecated mapping.
model = MusicGen.get_pretrained("facebook/musicgen-melody")
model.set_generation_params(duration=30)

output = model.generate_with_chroma(
    descriptions=["synthwave remix, pulsing bass, neon aesthetic"],
    melody_wavs=melody_wav,
    melody_sample_rate=sr
)

# Save at the rate the model actually generates at instead of a hard-coded 32000.
torchaudio.save("remixed.wav", output[0].cpu(), sample_rate=model.sample_rate)
print("Remix saved → remixed.wav")

Pipeline 3: Inpaint Loop (Selective Section Replacement)

Goal: Iteratively improve a weak section of a generated song.

Models used: Sonauto v2 (generation + inpaint) → human review loop

import os, time, requests, json

# Root URL that every Sonauto endpoint path in this script is appended to.
BASE = "https://api.sonauto.ai/v1"
# Headers sent with every request. The bearer token is read from the
# SONAUTO_API_KEY environment variable; if it is unset this line raises KeyError.
H = {"Authorization": f"Bearer {os.environ['SONAUTO_API_KEY']}", "Content-Type": "application/json"}

def poll(task_id, interval=5.0, max_wait=600.0):
    """Block until a generation task finishes and return its first song path.

    Args:
        task_id: Identifier returned by the endpoint that started the task.
        interval: Seconds to sleep between two status checks.
        max_wait: Give up after this many seconds instead of polling forever.

    Returns:
        ``song_paths[0]`` from the JSON body of ``GET {BASE}/generations/{task_id}``.

    Raises:
        RuntimeError: The status endpoint reported ``FAILURE``.
        TimeoutError: The task did not finish within ``max_wait`` seconds.
        requests.RequestException: A request timed out, could not connect,
            or came back with a 4xx/5xx status.
    """
    deadline = time.monotonic() + max_wait
    while True:
        status_resp = requests.get(
            f"{BASE}/generations/status/{task_id}", headers=H, timeout=30
        )
        # An error body never equals SUCCESS or FAILURE; without this check the
        # loop would never terminate on a bad key or unknown task id.
        status_resp.raise_for_status()
        # The body is the bare status word, possibly wrapped in double quotes.
        s = status_resp.text.strip('"')
        if s == "SUCCESS":
            result_resp = requests.get(
                f"{BASE}/generations/{task_id}", headers=H, timeout=30
            )
            result_resp.raise_for_status()
            return result_resp.json()["song_paths"][0]
        if s == "FAILURE":
            # Name the task so the failing step can be identified from the traceback.
            raise RuntimeError(f"Task {task_id} failed")
        if time.monotonic() >= deadline:
            raise TimeoutError(f"Task {task_id} still '{s}' after {max_wait:.0f} s")
        time.sleep(interval)

# Initial generation
gen_resp = requests.post(f"{BASE}/generations/v2", headers=H, timeout=30, json={
    "tags": ["rock", "anthemic"],
    "lyrics": "[Verse]\nI walk alone at midnight\n[Chorus]\nRise above the noise"
})
# Fail here on a 4xx/5xx response instead of with a KeyError on "task_id".
gen_resp.raise_for_status()
initial = poll(gen_resp.json()["task_id"])

# current_url always names the take being edited; start with the first generation.
current_url = initial
print(f"Initial: {current_url}")

# Inpaint the chorus section (30s–60s) with improved lyrics
inpaint_resp = requests.post(f"{BASE}/generations/v2/inpaint", headers=H, timeout=30, json={
    "audio_url": current_url,
    "tags": ["rock", "anthemic"],
    "lyrics": "Soar beyond the fire, never surrender",
    "sections": [[30.0, 60.0]]   # list of [start, end] pairs; here a single 30.0–60.0 window
})
inpaint_resp.raise_for_status()
improved = poll(inpaint_resp.json()["task_id"])

print(f"Inpainted: {improved}")

Pipeline 4: Fan-out Variation + Best-of Selection

Goal: Generate N variations in parallel, collect every one that succeeds, and then pick the best (by ear or with a scoring model).

import os, asyncio, aiohttp

# Root URL that every Sonauto endpoint path in this script is appended to.
BASE = "https://api.sonauto.ai/v1"
# API key used to build the bearer Authorization header of each request;
# if SONAUTO_API_KEY is unset this line raises KeyError.
KEY = os.environ["SONAUTO_API_KEY"]

async def generate_one(session, prompt, tags):
    """Start one instrumental v3 generation and return its task id.

    Args:
        session: Open ``aiohttp.ClientSession`` used to send the request.
        prompt: Text prompt forwarded as the ``prompt`` field.
        tags: Tag list forwarded as the ``tags`` field.

    Returns:
        The ``task_id`` value from the JSON response.

    Raises:
        aiohttp.ClientResponseError: The API answered with a 4xx/5xx status.
    """
    payload = {"prompt": prompt, "tags": tags, "instrumental": True}
    auth = {"Authorization": f"Bearer {KEY}"}
    async with session.post(f"{BASE}/generations/v3", json=payload, headers=auth) as r:
        # Report the HTTP error itself rather than a KeyError on "task_id".
        r.raise_for_status()
        return (await r.json())["task_id"]

async def poll_one(session, task_id, interval=5.0, max_wait=600.0):
    """Wait for one generation task and return its first song path.

    Args:
        session: Open ``aiohttp.ClientSession`` used for the status requests.
        task_id: Identifier returned when the task was started.
        interval: Seconds to sleep between two status checks.
        max_wait: Stop waiting after this many seconds.

    Returns:
        ``song_paths[0]`` of the finished task, or ``None`` when the task
        reported ``FAILURE`` or did not finish within ``max_wait`` seconds.

    Raises:
        aiohttp.ClientResponseError: The API answered with a 4xx/5xx status.
    """
    auth = {"Authorization": f"Bearer {KEY}"}
    loop = asyncio.get_running_loop()
    deadline = loop.time() + max_wait
    while True:
        async with session.get(f"{BASE}/generations/status/{task_id}", headers=auth) as r:
            # An error body never equals SUCCESS or FAILURE; without this check
            # the coroutine would poll forever on a bad key or unknown task id.
            r.raise_for_status()
            # The body is the bare status word, possibly wrapped in double quotes.
            status = (await r.text()).strip('"')
        if status == "SUCCESS":
            async with session.get(f"{BASE}/generations/{task_id}", headers=auth) as r:
                r.raise_for_status()
                return (await r.json())["song_paths"][0]
        if status == "FAILURE":
            return None
        if loop.time() >= deadline:
            # Treated like a failed variation: no URL to offer for this task.
            return None
        await asyncio.sleep(interval)

async def fan_out(prompt, tags, n=3):
    """Start ``n`` generations of the same prompt and collect the finished URLs.

    All requests share one ``aiohttp.ClientSession``. The tasks are first
    submitted concurrently, then polled concurrently.

    Args:
        prompt: Text prompt passed to every ``generate_one`` call.
        tags: Tag list passed to every ``generate_one`` call.
        n: Number of variations to request.

    Returns:
        The truthy results of ``poll_one`` in submission order; falsy results
        (such as ``None``) are dropped.
    """
    async with aiohttp.ClientSession() as session:
        submissions = [generate_one(session, prompt, tags) for _ in range(n)]
        task_ids = await asyncio.gather(*submissions)
        pollers = [poll_one(session, tid) for tid in task_ids]
        urls = await asyncio.gather(*pollers)
    # filter(None, ...) keeps only truthy entries.
    return list(filter(None, urls))

# Run 3 variations, pick manually or score with a classifier
launch_prompt = "uplifting electronic track for a product launch"
launch_tags = ["electronic", "uplifting", "2020s"]
variations = asyncio.run(fan_out(launch_prompt, launch_tags))   # n is left at its default of 3

# Print every returned variation, numbered from 1, for side-by-side review.
for i, url in enumerate(variations):
    print(f"Variation {i+1}: {url}")

Choosing a Pipeline

Scenario	Recommended Pipeline
Need a full 3+ min track	Generate → Extend
Want style transfer on existing audio	Stems → MusicGen melody cond.
One weak section in an otherwise good take	Inpaint loop
Exploring creative directions	Fan-out + best-of
Client needs vocal + custom instrumental	Suno vocals → Demucs → MusicGen instrumental

Core Pipeline Topology

Pipeline 1: Generate → Extend → Master

Pipeline 2: Suno → Stems → MusicGen Remix

Pipeline 3: Inpaint Loop (Selective Section Replacement)

Pipeline 4: Fan-out Variation + Best-of Selection

Choosing a Pipeline

Related