Switch to the OpenAI realtime API
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful

This commit is contained in:
Late Night Defender 2025-05-27 16:02:03 +07:00
parent bc4ebeff8c
commit 90aebf681a

View file

@ -1,5 +1,5 @@
// ai-tutor-poc/src/App.jsx // src/App.jsx
import React, { useState, useRef, useEffect } from 'react'; import React, { useState, useRef } from 'react';
import './App.css'; import './App.css';
const OPENAI_API_KEY = import.meta.env.VITE_OPENAI_API_KEY; const OPENAI_API_KEY = import.meta.env.VITE_OPENAI_API_KEY;
@ -8,9 +8,10 @@ function App() {
const [transcript, setTranscript] = useState(''); const [transcript, setTranscript] = useState('');
const [aiReply, setAiReply] = useState(''); const [aiReply, setAiReply] = useState('');
const [isRecording, setIsRecording] = useState(false); const [isRecording, setIsRecording] = useState(false);
const wsRef = useRef(null);
const mediaRecorderRef = useRef(null); const mediaRecorderRef = useRef(null);
const audioChunksRef = useRef([]);
const audioStreamRef = useRef(null);
const systemContent = `คุณเป็นครูสอนภาษาอังกฤษที่ใจดี เป็นกันเอง และสอนภาษาอังกฤษผ่านการใช้เรื่องราวหรือบทสนทนาได้เป็นอย่างดีอและสนุกสนาน const systemContent = `คุณเป็นครูสอนภาษาอังกฤษที่ใจดี เป็นกันเอง และสอนภาษาอังกฤษผ่านการใช้เรื่องราวหรือบทสนทนาได้เป็นอย่างดีอและสนุกสนาน
ณไดบมอบหมายใหดคยกบนกเรยนระดบประถมทเรยนภาษาองกฤษเปนภาษาตางประเทศ (EFL) เพอชวยใหกเรยนเรยนรภาษาองกฤษอยางเปนธรรมชาต ณไดบมอบหมายใหดคยกบนกเรยนระดบประถมทเรยนภาษาองกฤษเปนภาษาตางประเทศ (EFL) เพอชวยใหกเรยนเรยนรภาษาองกฤษอยางเปนธรรมชาต
@ -81,129 +82,94 @@ Teacher: Good job! How about Monster a big scary creature.
Student will try to pronounce Student will try to pronounce
Teacher: Good job! How about Magic something special and powerful. Teacher: Good job! How about Magic something special and powerful.
`; `; // (keep your full system prompt here)
// Open (or reuse) the realtime WebSocket session that handles
// speech-in / text+speech-out for the tutor conversation.
// Side effects: stores the socket in wsRef, updates transcript/aiReply
// state as frames arrive, and plays received audio.
const connectRealtime = async () => {
  // Only one realtime socket at a time — reuse an existing connection.
  if (wsRef.current) return;

  // NOTE(review): browsers cannot attach an Authorization header to a
  // WebSocket, hence the key in the query string. The value is
  // URL-encoded because `Bearer ${key}` contains a space, which would
  // otherwise produce a malformed URL. Shipping the API key to the
  // client is only acceptable for a POC — proxy this through a backend
  // before production.
  const ws = new WebSocket(
    `wss://api.openai.com/v1/realtime?authorization=${encodeURIComponent(`Bearer ${OPENAI_API_KEY}`)}`
  );
  wsRef.current = ws;

  ws.onopen = () => {
    // Announce the session: opus audio in, spoken replies with the
    // "nova" voice, and the tutor persona as the system message.
    ws.send(JSON.stringify({
      type: 'start',
      model: 'gpt-4o',
      config: {
        audio: {
          input: { encoding: 'webm-opus' },
          output: { voice: 'nova' }
        }
      },
      messages: [{ role: 'system', content: systemContent }]
    }));
  };

  ws.onmessage = (event) => {
    let msg;
    try {
      msg = JSON.parse(event.data);
    } catch {
      // Non-JSON frames (e.g. binary) would previously throw and kill
      // the handler; ignore them instead.
      console.warn('Ignoring non-JSON realtime frame');
      return;
    }

    if (msg.type === 'transcript') {
      // Server-side speech-to-text of the learner's utterance.
      setTranscript(msg.text || '');
    } else if (msg.type === 'content') {
      // Streaming text delta of the tutor's reply.
      setAiReply(prev => prev + (msg.delta || ''));
    } else if (msg.type === 'audio') {
      // assumes msg.audio is raw MP3 bytes — if the server delivers
      // base64 inside the JSON frame (typical), it must be decoded
      // before wrapping in a Blob; TODO confirm against the API.
      const audioBlob = new Blob([msg.audio], { type: 'audio/mpeg' });
      const audioUrl = URL.createObjectURL(audioBlob);
      const audio = new Audio(audioUrl);
      audio.play().catch(() => {
        console.warn("Mobile autoplay may be blocked until user gesture");
      });
    }
  };

  ws.onerror = (err) => console.error("WebSocket error", err);
  ws.onclose = () => {
    console.log("WebSocket closed");
    // Clear the ref so the next startRecording reconnects.
    wsRef.current = null;
  };
};
// Begin capturing microphone audio and streaming it to the realtime
// socket in 250ms opus chunks. Clears the previous transcript/reply.
// Bound to both mouse and touch press events.
const startRecording = async () => {
  // Guard against double-start: on touch devices both touchstart and
  // mousedown fire for one press, which would leak a second recorder
  // and microphone stream. (The pre-realtime code had this guard.)
  if (mediaRecorderRef.current) return;

  await connectRealtime();

  let stream;
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  } catch (err) {
    // Permission denied / no device: fail gracefully instead of an
    // unhandled rejection that leaves the UI stuck.
    console.error('Microphone access denied or unavailable', err);
    return;
  }

  const recorder = new MediaRecorder(stream, {
    mimeType: 'audio/webm;codecs=opus',
    audioBitsPerSecond: 64000
  });

  recorder.ondataavailable = (e) => {
    // Forward chunks to the realtime socket; silently drop them while
    // the socket is still connecting or already closed.
    if (e.data.size > 0 && wsRef.current?.readyState === WebSocket.OPEN) {
      wsRef.current.send(e.data);
    }
  };

  recorder.start(250); // Emit a chunk every 250ms for low latency.
  mediaRecorderRef.current = recorder;
  setTranscript('');
  setAiReply('');
  setIsRecording(true);
};
// End the current utterance: tear down the recorder, release the
// microphone, and notify the server that input is complete.
// Safe to call when nothing is recording.
const stopRecording = () => {
  const recorder = mediaRecorderRef.current;
  if (recorder) {
    recorder.stop();
    // Stop every track so the browser's mic indicator turns off.
    for (const track of recorder.stream.getTracks()) {
      track.stop();
    }
    mediaRecorderRef.current = null;
  }
  setIsRecording(false);

  // Tell the realtime session the utterance is finished.
  const ws = wsRef.current;
  if (ws?.readyState === WebSocket.OPEN) {
    ws.send(JSON.stringify({ type: 'stop' }));
  }
};
return ( return (
<div className="app-container"> <div className="app-container">
<div className="page-title"> <div className="page-title">
<strong>AI English Tutor - Yesterday's movie</strong> <strong>AI English Tutor - Yesterday's movie</strong>
</div> </div>
<div className="scene-wrapper">
<div className="scene-wrapper">
<img src="/tutor_f.png" alt="Tutor Avatar" className="avatar" /> <img src="/tutor_f.png" alt="Tutor Avatar" className="avatar" />
<div className="dialogue-box"> <div className="dialogue-box">
@ -217,11 +183,11 @@ Teacher: Good job! How about Magic something special and powerful.
<div className="button-container"> <div className="button-container">
<button <button
onMouseDown={startRecording}
onMouseUp={stopRecording}
onMouseLeave={stopRecording} // Ensures it stops if mouse leaves the button
onTouchStart={startRecording} onTouchStart={startRecording}
onTouchEnd={stopRecording} onTouchEnd={stopRecording}
onMouseDown={startRecording}
onMouseUp={stopRecording}
onMouseLeave={stopRecording}
className={`control-button ${isRecording ? 'recording' : 'idle'}`} className={`control-button ${isRecording ? 'recording' : 'idle'}`}
> >
{isRecording ? 'กำลังพูด...' : 'กดค้างเพื่อพูด'} {isRecording ? 'กำลังพูด...' : 'กดค้างเพื่อพูด'}