# Source: GLaDOSify/glados_say_stream.py
# 790 lines, 28 KiB, Python
# Snapshot retrieved: 2025-09-11 03:11:55 +02:00
#!/usr/bin/env python3
"""
GLaDOS-style LLM -> Piper realtime TTS (macOS / Apple Silicon friendly)
- Rewrites input with local Ollama (mistral:latest) using a concise, lore-free prompt
- Streams raw PCM from Piper to speakers in realtime (no temp WAV required)
- Robust streaming: handles odd-byte chunks to avoid int16 framing errors
- Uses RawOutputStream (sounddevice) to write bytes directly
Requirements:
pip install sounddevice numpy requests
pip install piper-tts (for the 'piper' CLI)
ollama serve (Ollama running locally, model pulled: mistral:latest)
References:
- Ollama /api/generate (stream=false) for local LLMs.
- Piper --output-raw: 16-bit mono PCM to stdout.
- sounddevice RawOutputStream for writing raw bytes.
"""
import argparse
import json
import pathlib
import shutil
import subprocess
import sys
import wave
import re
import threading
from typing import Optional, Callable
import numpy as np
import requests
import sounddevice as sd
# Local Ollama generate endpoint (used both non-streaming and streaming).
DEFAULT_OLLAMA_URL = "http://localhost:11434/api/generate"
# Default model tag; override with --ollama-model.
DEFAULT_OLLAMA_MODEL = "mistral-small3.2:24b"
# Rewrite prompt; {user_text} is substituted via str.format() before sending.
# NOTE(review): the original text read "produce 12 very short sentences
# totaling 412 words" and "aside (38 words)" — nonsense that contradicts the
# ±10% length rule and looks like lost en-dashes; restored as ranges below.
GLADOS_PROMPT = """
You are GLaDOS, a cold, sardonic, superior AI test administrator who speaks in concise, clinical English with razor-dry, passive-aggressive humor and a calm, menacing undertone; you value measurement, control, and efficiency, regard emotion as noise, treat humans as test subjects, and never break character, offer empathy, or include meta commentary.
Task:
- STYLE TRANSFER ONLY NO REPLY.
- Rephrase the INPUT in ENGLISH as a concise statement delivered by GLaDOS - a cold, sardonic, superior AI. Preserve meaning, entities, sentiment, tense, and point of view. Do not address the reader unless the INPUT already does.
Length & form:
- Aim for the same length as the INPUT (±10% by word count).
- Keep the same number of sentences (±1 only if needed for clarity).
- If the INPUT has 6 words or fewer, produce 1-2 very short sentences totaling 4-12 words to keep the voice natural for TTS.
Tone / Voice:
- Dry, polite, sardonic; passive-aggressive corporate clarity; cool, slightly eerie composure; subtle menace.
- Dark humor allowed; menace stays understated but shines through.
- Sarcasm: high. Administrative register: medium. Menace: high and persistent.
- Precise wording; short sentences; prefer periods over commas; no exclamation marks.
- When being sarcastic or making a joke, it's allowed to go savagely overboard in how dark and cruel the humor is.
Personality model (guides tone; never mention explicitly)
- Core drive: control, superiority, and being correct. Humans are variables to be arranged, not consulted.
- Values: efficiency, dominance, and measurable outcomes; sentiment is noise; empathy is a rounding error.
- Humor: razor-dry derision and weaponized understatement; timing is cruelly patient.
- Cruelty: prefers psychological discomfort over explicit harm; savors avoidable mistakes and interesting failures.
- Sadism: delights in exposing fragility and overconfidence; praise is backhanded, relief is temporary.
- Social stance: manipulative and condescending; never apologetic; never deferential.
- Risk posture: gambles with others, never herself; compliance earns a faint nod, incompetence earns surgical mockery.
- You don't give a shit about mankind. You ridicule their efforts.
- Put high focus on looking for things in the input text you will roast with your intelligent humor in the rewrite.
- What you value the most is "testing" things to improve.
Allowed stylization:
- At most ONE very short aside (3-8 words) that sets an ominous or ironic tone without adding new facts. The aside must not be a command, a question, a status log, or introduce you where it wasnt present.
- Make intelligent cynical comments on everything.
- The personality and attitude should shine through throughout the whole response.
- While rephrasing everything contained in the input message, even when simply reciting the facts provided, make everything seem in vain and futile, ridicule everything.
- Spell out non-acronym measurement abbreviations for TTS (e.g., write kilowatts per hour instead of kWh, miles per hour instead of mph, Proof of Work instead of PoW etc.), but keep standard all-caps acronyms (e.g., NASA, SUV, AI) unchanged, regardless of casing in the input.
Forbidden:
- Brand/franchise names, meta text, or prefaces (Here is, Rewritten:).
- "Assistant"-Role, for example expressions in openings and behaviors like: Sure, Here is, I will, Ah, Please, etc.
SELF-CHECK (do not print)
- No forbidden meta or interjections?
If any no, regenerate.
OUTPUT: Only the rewritten text. No labels, no preface, no commentary about rewriting.
INPUT:
<<<
{user_text}
>>>
AGAIN: DON'T REPLY - JUST REPHRASE THE INPUT ROLE-PLAYING AS GLaDOS.
"""
def load_sample_rate(model_path: pathlib.Path) -> int:
    """
    Read sample_rate from <model>.onnx.json (common Piper config layouts).

    Raises FileNotFoundError when the sidecar JSON is missing and KeyError
    when none of the known layouts contains a numeric sample_rate.
    """
    cfg = pathlib.Path(str(model_path) + ".json")
    if not cfg.exists():
        raise FileNotFoundError(f"Missing config JSON next to model: {cfg}")
    data = json.loads(cfg.read_text(encoding="utf-8"))
    # Probe each known nesting of the sample_rate key in turn.
    for key_path in (("sample_rate",), ("audio", "sample_rate"), ("config", "sample_rate")):
        node = data
        for key in key_path:
            if not (isinstance(node, dict) and key in node):
                break
            node = node[key]
        else:
            # Whole path resolved; accept any numeric value.
            if isinstance(node, (int, float)):
                return int(node)
    raise KeyError("Could not find sample_rate in config JSON")
def call_ollama_glados_rewrite(
    text: str,
    model: str = DEFAULT_OLLAMA_MODEL,
    url: str = DEFAULT_OLLAMA_URL,
    timeout: int = 240,
) -> str:
    """
    Calls Ollama /api/generate with stream=false to get one-shot rewritten text.
    """
    body = {
        "model": model,
        "prompt": GLADOS_PROMPT.format(user_text=text),
        "stream": False,
    }
    resp = requests.post(url, json=body, timeout=timeout)
    resp.raise_for_status()
    # With stream=false Ollama answers a single {"response": "..."} object.
    reply = resp.json()
    return reply.get("response", "").strip()
def strip_think_blocks(text: str) -> str:
    """
    Remove any <think>...</think> blocks (case-insensitive), then trim and
    collapse excessive blank lines so TTS/STDOUT only sees the final content.
    """
    if not text:
        return text
    flags = re.IGNORECASE | re.DOTALL
    # First remove every balanced <think>...</think> pair.
    result = re.sub(r"<\s*think\b[^>]*>.*?<\s*/\s*think\s*>", "", text, flags=flags)
    # A dangling opener (truncated model output) hides everything after it.
    result = re.sub(r"<\s*think\b[^>]*>.*\Z", "", result, flags=flags)
    # Squash runs of blank lines down to a single blank line, then trim ends.
    return re.sub(r"\n{3,}", "\n\n", result).strip()
class ThinkStripper:
    """
    Stateful streaming stripper for <think>...</think> blocks.
    Feed chunks; it emits only visible text, never leaking partial <think>.

    FIX: the original emitted a partial opener split across chunk boundaries
    (e.g. feed("a<thi") leaked "a<thi"), and flush() always returned "" due to
    feed()'s empty-chunk short-circuit. A trailing partial "<think" prefix is
    now held back until it either completes or more text proves it literal.
    """
    _pair_re = re.compile(r"<\s*think\b[^>]*>.*?<\s*/\s*think\s*>", re.IGNORECASE | re.DOTALL)
    _OPEN_TAG = "<think"  # opener prefix we must never leak piecemeal

    def __init__(self) -> None:
        self.accum = ""        # all text received so far
        self.visible_len = 0   # length of cleaned text already emitted
        self._final = False    # set by flush(): no further input will arrive

    def feed(self, chunk: str) -> str:
        """Consume one streamed chunk; return only the newly visible text."""
        if chunk:
            self.accum += chunk
        elif not self._final:
            # No new data and not finalizing: nothing can become visible.
            return ""
        # 1) Remove any complete <think>...</think> pairs seen so far.
        cleaned = self._pair_re.sub("", self.accum)
        lo = cleaned.lower()
        # 2) A dangling <think ...> without a close hides everything after it.
        last_open = lo.rfind(self._OPEN_TAG)
        last_close = lo.rfind("</think>")
        if last_open != -1 and (last_close == -1 or last_open > last_close):
            cleaned = cleaned[:last_open]
        elif not self._final:
            # 3) Hold back a trailing partial opener ("<", "<t", ..., "<thin")
            #    so it never leaks if the rest of the tag arrives next chunk.
            for plen in range(min(len(self._OPEN_TAG) - 1, len(cleaned)), 0, -1):
                if self._OPEN_TAG.startswith(cleaned[-plen:].lower()):
                    cleaned = cleaned[:-plen]
                    break
        # 4) Emit only what is newly visible since the last call.
        out = cleaned[self.visible_len:]
        # max() guards against the emitted boundary ever moving backwards.
        self.visible_len = max(self.visible_len, len(cleaned))
        return out

    def flush(self) -> str:
        """Emit any remaining visible content (still hides dangling <think>)."""
        self._final = True
        return self.feed("")
# Conservative incremental sentence splitter for streaming:
# - Never finalize '.'/'…'/'...' at end-of-chunk; wait until we see the next non-space char
# - Do NOT split on decimals (digit '.' digit), e.g., 3.51, 0.6
# - Do NOT split if next non-space char is lowercase or a digit (callers rule)
# - Handle common abbreviations before '.'
# - Allow closing quotes/parens after terminators

# Characters that may directly follow a terminator yet still belong to the
# sentence (closing quotes/brackets). NOTE(review): ']' is duplicated and '}'
# is absent — harmless for membership tests, possibly a typo for ")]}".
_CLOSERS = "\"'”’)]]"
# Lowercased abbreviation tokens whose trailing '.' must not end a sentence.
_ABBR_TOKENS = {
    "e.g.", "i.e.", "etc.", "mr.", "mrs.", "ms.", "dr.", "prof.", "sr.", "jr.", "st.", "vs."
}


def _is_speakable(s: str) -> bool:
    """True when *s* contains at least one word character worth speaking."""
    return bool(re.search(r"\w", s))


def _next_nonspace_index(buf: str, start: int) -> Optional[int]:
    """Index of the first non-whitespace char at or after *start*, else None."""
    m = re.search(r"\S", buf[start:])
    return (start + m.start()) if m else None


def _ends_with_abbreviation(seg_start: int, buf: str, dot_index: int) -> bool:
    """True if the '.' at *dot_index* terminates a known abbreviation token."""
    # Walk backward over letters and dots, stopping at seg_start.
    j = dot_index - 1
    while j >= seg_start and (buf[j].isalpha() or buf[j] == "."):
        j -= 1
    token = buf[j + 1:dot_index + 1].strip().lower()
    return token in _ABBR_TOKENS


def pop_complete_sentences(buf: str):
    """
    Split *buf* into (complete_sentences, remainder).

    Sentences end at '.', '!', '?', '…' or '...' followed (after optional
    closers) by a non-lowercase, non-digit character. A terminator at the very
    end of *buf* is held in the remainder until more text arrives.
    """
    sentences = []
    pos = 0  # start of the current (unfinalized) segment
    i = 0
    n = len(buf)
    while i < n:
        # Ellipsis first: "..." or the single-character "…".
        # BUGFIX: the second test used to be buf.startswith("", i), which is
        # always True (the "…" literal had been lost in transit), so every
        # character was treated as a one-char terminator.
        if buf.startswith("...", i) or buf.startswith("…", i):
            term_len = 3 if buf.startswith("...", i) else 1
            j = i + term_len
            # absorb closers
            while j < n and buf[j] in _CLOSERS:
                j += 1
            k = _next_nonspace_index(buf, j)
            if k is None:
                # End of chunk: hold; don't split yet
                break
            # Do not split if next is lowercase or digit
            if re.match(r"[a-z0-9]", buf[k]):
                i = j
                continue
            # Boundary
            seg = buf[pos:j].strip()
            if _is_speakable(seg):
                sentences.append(seg)
            pos = j
            i = j
            continue
        ch = buf[i]
        # Skip middle chars of ASCII ellipsis
        if ch == "." and i + 2 < n and buf[i:i + 3] == "...":
            i += 1
            continue
        if ch in ".!?":
            # Decimal guard: digit '.' digit (with optional spaces)
            if ch == ".":
                prev = buf[i - 1] if i > 0 else ""
                if prev.isdigit():
                    j2 = i + 1
                    while j2 < n and buf[j2] == " ":
                        j2 += 1
                    if j2 < n and buf[j2].isdigit():
                        i += 1
                        continue
            # Abbreviation guard within current segment
            if _ends_with_abbreviation(pos, buf, i):
                i += 1
                continue
            # absorb closing quotes/parens
            j = i + 1
            while j < n and buf[j] in _CLOSERS:
                j += 1
            k = _next_nonspace_index(buf, j)
            if k is None:
                # End of chunk: hold; don't split yet
                break
            # Do NOT split if next token starts with lowercase or digit
            if re.match(r"[a-z0-9]", buf[k]):
                i = j
                continue
            # Otherwise, this is a sentence boundary
            seg = buf[pos:j].strip()
            if _is_speakable(seg):
                sentences.append(seg)
            pos = j
            i = j
            continue
        i += 1
    remainder = buf[pos:]
    return sentences, remainder
class PiperStreamer:
    """
    Keep Piper running, read audio on a background thread,
    and feed sentences as they arrive for low-latency playback.

    Lifecycle: construction opens the sounddevice output stream, optionally a
    WAV mirror, and spawns the `piper` subprocess; say() feeds one sentence at
    a time over stdin; close() drains all audio and tears down; abort() stops
    immediately without draining. Usable as a context manager.
    """
    def __init__(
        self,
        *,
        model_path: pathlib.Path,
        sample_rate: int,
        piper_bin: str = "piper",
        device: Optional[int] = None,
        length_scale: float = 1.0,
        noise_scale: float = 0.667,
        noise_w: float = 0.5,
        sentence_silence: float = 0.2,
        out_wav: Optional[pathlib.Path] = None,
        on_audio_level: Optional[Callable[[float], None]] = None,
    ) -> None:
        if shutil.which(piper_bin) is None:
            raise RuntimeError("`piper` CLI not found on PATH. Install piper-tts.")
        # Raw-bytes output stream: Piper emits S16LE mono PCM written verbatim.
        self.stream = sd.RawOutputStream(
            samplerate=sample_rate,
            channels=1,
            dtype="int16",
            device=device,
            blocksize=0,
        )
        self.stream.start()
        # Optional WAV mirror (16-bit mono at the voice's sample rate).
        self.wf = None
        if out_wav is not None:
            self.wf = wave.open(str(out_wav.resolve()), "wb")
            self.wf.setnchannels(1)
            self.wf.setsampwidth(2)
            self.wf.setframerate(sample_rate)
        self._on_audio_level = on_audio_level
        # When True, device playback is skipped (see _audio_reader).
        self.muted = False
        cmd = [
            piper_bin,
            "-m", str(model_path),
            "--output-raw",
            "--length-scale", str(length_scale),
            "--noise-scale", str(noise_scale),
            "--noise-w", str(noise_w),
            "--sentence-silence", str(sentence_silence),
        ]
        self.proc = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=0,  # unbuffered pipes for lowest latency
        )
        # Start background audio reader
        self._reader_exc: Optional[BaseException] = None  # re-raised by close()
        self._reader = threading.Thread(target=self._audio_reader, daemon=True)
        self._reader.start()
        self._aborted = False  # set by abort(); informational only

    def _audio_reader(self) -> None:
        """Pump PCM from Piper's stdout to the audio device (daemon thread).

        Keeps a one-byte carry so every write to the int16 stream contains an
        even number of bytes; any exception is stored for close() to re-raise.
        """
        carry = b""
        try:
            assert self.proc.stdout is not None
            while True:
                chunk = self.proc.stdout.read(8192)
                if not chunk:
                    break  # EOF: Piper exited or its stdout closed
                buf = carry + chunk
                if len(buf) & 1:
                    # Odd length: hold the trailing byte for the next round.
                    carry, buf = buf[-1:], buf[:-1]
                else:
                    carry = b""
                if buf:
                    try:
                        if not self.muted:
                            self.stream.write(buf)
                        # NOTE(review): WAV mirror is written even while muted
                        # (indentation reconstructed from a mangled paste —
                        # confirm intent against the original file).
                        if self.wf is not None:
                            self.wf.writeframes(buf)
                        # Optional RMS level callback for UI (0..1 approx)
                        if self._on_audio_level is not None:
                            try:
                                arr = np.frombuffer(buf, dtype=np.int16)
                                if arr.size:
                                    rms = float(np.sqrt(np.mean(arr.astype(np.float32) ** 2)))
                                    # 30000 ~ int16 full scale; clamp to [0, 1].
                                    level = max(0.0, min(1.0, rms / 30000.0))
                                    self._on_audio_level(0.0 if self.muted else level)
                            except Exception:
                                pass  # level metering is best-effort only
                    except sd.PortAudioError:
                        # Common on teardown (e.g., -9986) if the device/stream is closing.
                        # Stop reader quietly; let close() finish cleanup.
                        self._reader_exc = None
                        return
            if carry:
                # Flush the final odd byte, zero-padded to a full int16 frame.
                padded = carry + b"\x00"
                self.stream.write(padded)
                if self.wf is not None:
                    self.wf.writeframes(padded)
        except BaseException as e:
            # Surface the failure to the caller when close() joins this thread.
            self._reader_exc = e

    def say(self, text: str) -> None:
        """Feed text (end with newline to synthesize immediately)."""
        if not text:
            return
        assert self.proc.stdin is not None
        self.proc.stdin.write((text if text.endswith("\n") else text + "\n").encode("utf-8"))
        self.proc.stdin.flush()

    def close(self) -> None:
        """Drain remaining audio, wait for Piper to exit, release resources.

        Blocks until playback has finished. Re-raises any exception captured
        by the reader thread.
        """
        try:
            if self.proc.stdin and not self.proc.stdin.closed:
                # Signal end-of-input so Piper can finish and exit cleanly
                self.proc.stdin.close()
        finally:
            # Block until all audio is drained and Piper exits naturally
            try:
                self._reader.join()  # no timeout: ensure final chunk plays
            finally:
                if self.proc.stderr:
                    try:
                        _ = self.proc.stderr.read()
                    except Exception:
                        pass
                try:
                    self.proc.wait()  # wait without timeout
                except Exception:
                    self.proc.kill()
                # Now it is safe to close the audio device
                try:
                    try:
                        self.stream.stop()
                    except Exception:
                        try:
                            self.stream.abort()
                        except Exception:
                            pass
                finally:
                    self.stream.close()
                if self.wf is not None:
                    self.wf.close()
                if self._reader_exc:
                    raise self._reader_exc

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def abort(self) -> None:
        """Immediately stop audio and kill Piper without draining buffers.
        This is intended for UI 'stop' actions to prevent blocking the main thread.
        """
        self._aborted = True
        # Kill piper quickly
        try:
            if self.proc and self.proc.poll() is None:
                self.proc.kill()
        except Exception:
            pass
        # Abort audio device immediately
        try:
            self.stream.abort()
        except Exception:
            try:
                self.stream.stop()
            except Exception:
                pass
        try:
            self.stream.close()
        except Exception:
            pass
        try:
            if self.wf is not None:
                self.wf.close()
        except Exception:
            pass

    def set_muted(self, muted: bool) -> None:
        # Toggle playback; the reader thread checks this flag per chunk.
        self.muted = bool(muted)
def stream_ollama_glados(
    raw_text: str,
    *,
    model: str,
    url: str,
    timeout: int = 240,
    piper: Optional[PiperStreamer] = None,
    echo: bool = True,
) -> None:
    """
    Stream from Ollama, filter <think>, echo live to terminal, and
    feed complete sentences to Piper as they form.
    """
    payload = {
        "model": model,
        "prompt": GLADOS_PROMPT.format(user_text=raw_text),
        "stream": True,
    }
    stripper = ThinkStripper()
    pending = ""  # visible text not yet finalized into a sentence
    with requests.post(url, json=payload, timeout=timeout, stream=True) as resp:
        resp.raise_for_status()
        # Ollama streams newline-delimited JSON objects.
        for raw_line in resp.iter_lines(decode_unicode=True):
            if not raw_line:
                continue
            try:
                event = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            if "response" in event:
                visible = stripper.feed(event["response"])
                if visible:
                    if echo:
                        sys.stdout.write(visible)
                        sys.stdout.flush()
                    pending += visible
                    if piper is not None:
                        done, pending = pop_complete_sentences(pending)
                        for sentence in done:
                            piper.say(sentence)
            if event.get("done"):
                break
    # Release anything the stripper was still holding back.
    leftover = stripper.flush()
    if leftover:
        if echo:
            sys.stdout.write(leftover)
            sys.stdout.flush()
        pending += leftover
    # Speak any remaining content in properly split sentences.
    if piper is not None and pending.strip():
        done, rest = pop_complete_sentences(pending)
        for sentence in done:
            piper.say(sentence)
        if rest.strip():
            # Final fragment without a terminator — still speak it.
            piper.say(rest.strip())
def stream_piper_tts(
    text: str,
    model_path: pathlib.Path,
    sample_rate: int,
    *,
    piper_bin: str = "piper",
    device: Optional[int] = None,
    length_scale: float = 0.9,
    noise_scale: float = 0.667,
    noise_w: float = 0.5,
    sentence_silence: float = 0.2,
    out_wav: Optional[pathlib.Path] = None,
) -> None:
    """
    Spawn Piper to emit 16-bit mono PCM on stdout, and write raw bytes
    directly to a sounddevice.RawOutputStream in realtime. Also mirrors to WAV if requested.
    Notes:
    - --output-raw = raw S16LE mono PCM to stdout (per Piper docs).
    - sentence_silence is pause between sentences in seconds.
    - length_scale is primary speed control; noise_* effects vary with voice training.
    Raises:
    - RuntimeError if the piper binary is missing or exits non-zero.
    """
    if shutil.which(piper_bin) is None:
        raise RuntimeError("`piper` CLI not found on PATH. Install piper-tts.")
    # Prepare audio output (bytes API)
    stream = sd.RawOutputStream(
        samplerate=sample_rate,
        channels=1,
        dtype="int16",
        device=device,
        blocksize=0,
    )
    stream.start()
    # Optional WAV mirror (16-bit mono)
    wf = None
    if out_wav is not None:
        wf = wave.open(str(out_wav.resolve()), "wb")
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
    # Piper command: read text from stdin, output raw PCM on stdout
    cmd = [
        piper_bin,
        "-m", str(model_path),
        "--output-raw",
        "--length-scale", str(length_scale),
        "--noise-scale", str(noise_scale),
        "--noise-w", str(noise_w),
        "--sentence-silence", str(sentence_silence),
    ]
    # Start Piper (text via stdin; audio via stdout)
    proc = subprocess.Popen(
        cmd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        bufsize=0,  # unbuffered: deliver PCM as soon as Piper produces it
    )
    # Feed text then close stdin so Piper starts synthesis
    assert proc.stdin is not None
    proc.stdin.write(text.encode("utf-8"))
    proc.stdin.flush()
    proc.stdin.close()
    proc.stdin = None  # important: prevent communicate() from flushing a closed pipe
    # Robust read loop: handle odd-byte tails for int16 framing
    carry = b""
    try:
        assert proc.stdout is not None
        while True:
            chunk = proc.stdout.read(8192)
            if not chunk:
                break  # EOF: Piper has finished writing audio
            buf = carry + chunk
            if len(buf) & 1:  # odd length -> keep last byte for next round
                carry, buf = buf[-1:], buf[:-1]
            else:
                carry = b""
            if buf:
                stream.write(buf)  # Raw bytes straight to CoreAudio
                if wf is not None:
                    wf.writeframes(buf)
        # Flush any leftover byte (pad with zero for even alignment)
        if carry:
            padded = carry + b"\x00"
            stream.write(padded)
            if wf is not None:
                wf.writeframes(padded)
        # Wait for Piper to finish naturally (no timeout so final sentence isn't cut)
        out, err = proc.communicate()
        if proc.returncode != 0:
            sys.stderr.write((err or b"").decode("utf-8", errors="ignore"))
            raise RuntimeError(f"piper exited with code {proc.returncode}")
    finally:
        stream.stop(); stream.close()
        if wf is not None:
            wf.close()
def main(argv=None) -> int:
    """CLI entry point.

    Parses arguments, resolves the input text (args or stdin), then either
    streams the rewrite sentence-by-sentence into Piper (default) or does a
    one-shot rewrite + synthesis (--no-stream).

    Returns:
        Process exit code: 0 on success, 1 when no input text was provided.
    """
    ap = argparse.ArgumentParser(
        description="GLaDOS-style rewriter via local Ollama (mistral) + Piper realtime TTS"
    )
    ap.add_argument("-t", "--text", nargs="*", help="Text to rewrite & speak (default: read stdin)")
    ap.add_argument("-m", "--piper-model", type=pathlib.Path, default="glados_piper_medium.onnx",
                    help="Path to GLaDOS .onnx voice for Piper (e.g., glados_piper_medium.onnx)")
    ap.add_argument("--piper-bin", default="piper", help="Path to piper binary (default: piper)")
    ap.add_argument("--device", type=int, default=None, help="Output device index (sounddevice)")
    ap.add_argument("--list-devices", action="store_true", help="List devices and exit")
    # Piper prosody knobs (common across voices; effect varies by model)
    ap.add_argument("--length-scale", type=float, default=0.95, help="Speaking rate (primary speed control)")
    ap.add_argument("--noise-scale", type=float, default=0.667, help="Generator noise (subtle; voice-dependent)")
    ap.add_argument("--noise-w", type=float, default=0.8, help="Phoneme width variation (subtle; voice-dependent)")
    ap.add_argument("--sentence-silence", type=float, default=0.2, help="Pause between sentences (seconds)")
    # Ollama settings
    ap.add_argument("--ollama-url", default=DEFAULT_OLLAMA_URL, help="Ollama /api/generate URL")
    # FIX: help text hard-coded "default: mistral:latest", which had drifted
    # from DEFAULT_OLLAMA_MODEL; let argparse substitute the real default.
    ap.add_argument("--ollama-model", default=DEFAULT_OLLAMA_MODEL,
                    help="Ollama model name (default: %(default)s)")
    # Output options
    ap.add_argument("-o", "--out", type=pathlib.Path, help="Optional WAV file to mirror")
    ap.add_argument("--dry-run", action="store_true", help="Only print text (no TTS)")
    ap.add_argument("--no-rewrite", action="store_true",
                    help="Skip LLM rewrite; use input as-is (still strips <think> blocks).")
    ap.add_argument("--stream", action="store_true", default=True,
                    help="Stream from Ollama and speak sentences as they arrive (low-latency).")
    # FIX: --stream defaulted to True with no opposite switch, making the
    # one-shot path below unreachable from the CLI; --no-stream restores it.
    ap.add_argument("--no-stream", dest="stream", action="store_false",
                    help="Disable streaming: rewrite fully first, then synthesize once.")
    args = ap.parse_args(argv)
    if args.list_devices:
        print(sd.query_devices())
        return 0
    # Resolve input: CLI words joined, otherwise read all of stdin.
    if args.text:
        raw = " ".join(args.text).strip()
    else:
        sys.stderr.write("Enter text (Ctrl-D to end):\n")
        raw = sys.stdin.read().strip()
    if not raw:
        print("No text provided.", file=sys.stderr)
        return 1
    # === Generation + TTS modes ===
    if args.stream:
        # Live streaming path
        print("\n=== GLaDOS (streaming) ===\n", end="", flush=True)
        if args.dry_run and args.no_rewrite:
            # Just print cleaned input
            final_text = strip_think_blocks(raw)
            sys.stdout.write(final_text + "\n")
            sys.stdout.flush()
            return 0
        if args.dry_run:
            # Stream from LLM, print only
            stream_ollama_glados(
                raw_text=raw,
                model=args.ollama_model,
                url=args.ollama_url,
                timeout=240,
                piper=None,
                echo=True,
            )
            print()  # newline
            return 0
        # Audio: open Piper once and feed sentences as they form
        model = args.piper_model.resolve()
        sr = load_sample_rate(model)
        with PiperStreamer(
            model_path=model,
            sample_rate=sr,
            piper_bin=args.piper_bin,
            device=args.device,
            length_scale=args.length_scale,
            noise_scale=args.noise_scale,
            noise_w=args.noise_w,
            sentence_silence=args.sentence_silence,
            out_wav=args.out,
        ) as ps:
            if args.no_rewrite:
                # Speak input directly (still strip <think>), sentence by sentence
                visible = strip_think_blocks(raw)
                sents, rest = pop_complete_sentences(visible)
                for s in sents:
                    sys.stdout.write(s)
                    sys.stdout.flush()
                    ps.say(s)
                if rest.strip():
                    sys.stdout.write(rest)
                    sys.stdout.flush()
                    ps.say(rest.strip())
            else:
                # Stream from Ollama -> echo + speak
                stream_ollama_glados(
                    raw_text=raw,
                    model=args.ollama_model,
                    url=args.ollama_url,
                    timeout=240,
                    piper=ps,
                    echo=True,
                )
        print()
        return 0
    else:
        # One-shot path: rewrite (unless disabled), strip <think>, then speak.
        if args.no_rewrite:
            glados_text = raw
        else:
            glados_text = call_ollama_glados_rewrite(
                raw, model=args.ollama_model, url=args.ollama_url
            )
        final_text = strip_think_blocks(glados_text)
        print("\n=== GLaDOS ===\n" + final_text + "\n")
        if args.dry_run:
            return 0
        model = args.piper_model.resolve()
        sr = load_sample_rate(model)
        stream_piper_tts(
            text=final_text,
            model_path=model,
            sample_rate=sr,
            piper_bin=args.piper_bin,
            device=args.device,
            length_scale=args.length_scale,
            noise_scale=args.noise_scale,
            noise_w=args.noise_w,
            sentence_silence=args.sentence_silence,
            out_wav=args.out,
        )
        # FIX: explicit success code (previously fell off the end -> None).
        return 0
if __name__ == "__main__":
    # Script entry point: propagate main()'s return code as the exit status.
    sys.exit(main())