ggml-org · whitead · Mar 19, 2023 · Mar 19, 2023 · Mar 19, 2023 · Mar 20, 2023
diff --git a/.gitignore b/.gitignore
@@ -23,6 +23,7 @@ build-sanitize-thread/
 /talk
 /talk-llama
 /bench
+/streammq
 /quantize
 
 arm_neon.h

diff --git a/Makefile b/Makefile
@@ -280,7 +280,7 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
 $(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
 
 clean:
-rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
+rm -f *.o main stream streammq command talk talk-llama bench quantize libwhisper.a libwhisper.so
 
 #
 # Examples
@@ -304,6 +304,9 @@ quantize: examples/quantize/quantize.cpp ggml.o $(WHISPER_OBJ) $(SRC_COMMON)
 stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
 $(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
 
+streammq: examples/streammq/streammq.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) # needs SDL2 and libzmq
+$(CXX) $(CXXFLAGS) examples/streammq/streammq.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o streammq $(CC_SDL) -lzmq $(LDFLAGS)
+
 command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
 $(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
 

diff --git a/README.md b/README.md
@@ -539,6 +539,24 @@ make stream
 
 https://user-images.usercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
 
+## Real-time audio with ZMQ
+
+Install ZeroMQ and SDL2 (e.g., `brew install zmq sdl2`).
+
+```sh
+make streammq
+./streammq --step -1 -m models/ggml-large.bin -t 8
+```
+
+I find it useful to use the basic VAD, so I use step `-1` instead of the above example.
+
+If `make` fails because it cannot find the `zmq.h` header, set these environment variables (I needed them on my M1 Mac):
+
+```sh
+export C_INCLUDE_PATH=$C_INCLUDE_PATH:/opt/homebrew/include
+export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/opt/homebrew/include
+```
+
 ## Confidence color-coding
 
 Adding the `--print-colors` argument will print the transcribed text using an experimental color coding strategy

diff --git a/examples/streammq/CMakeLists.txt b/examples/streammq/CMakeLists.txt
@@ -0,0 +1,12 @@
+if (WHISPER_SUPPORT_SDL2)
+# streammq: variant of the stream example that also links ZeroMQ (cppzmq); only built when SDL2 support is enabled
+set(TARGET streammq)
+add_executable(${TARGET} streammq.cpp)
+
+find_package(cppzmq REQUIRED)
+include(DefaultTargetOptions)
+
+target_link_libraries(${TARGET} PRIVATE common cppzmq common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
+endif ()
+
+
diff --git a/examples/streammq/README.md b/examples/streammq/README.md
@@ -0,0 +1,44 @@
+# stream
+
+This is a naive example of performing real-time inference on audio from your microphone.
+The `stream` tool samples the audio every half a second and runs the transcription continuously.
+More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
+
+```java
+./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
+```
+
+https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
+
+## Sliding window mode with VAD
+
+Setting the `--step` argument to `0` enables the sliding window mode:
+
+```java
+./stream -m ./models/ggml-small.en.bin -t 6 --step 0 --length 30000 -vth 0.6
+```
+
+In this mode, the tool will transcribe only after some speech activity is detected. A very
+basic VAD detector is used, but in theory a more sophisticated approach can be added. The
+`-vth` argument determines the VAD threshold - higher values will make it detect silence more often.
+It's best to tune it to the specific use case, but a value around `0.6` should be OK in general.
+When silence is detected, it will transcribe the last `--length` milliseconds of audio and output
+a transcription block that is suitable for parsing.
+
+## Building
+
+The `stream` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
+
+```bash
+# Install SDL2 on Linux
+sudo apt-get install libsdl2-dev
+
+# Install SDL2 on Mac OS
+brew install sdl2
+
+make stream
+```
+
+## Web version
+
+This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)