Directory structure:
└── examples/
├── README.md
├── abort-reload/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── get_started.html
│ └── get_started.js
├── cache-usage/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── cache_usage.html
│ └── cache_usage.ts
├── chrome-extension/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── content.js
│ ├── example.html
│ ├── manifest.json
│ ├── manifest_v2.json
│ ├── popup.css
│ ├── popup.html
│ └── popup.ts
├── chrome-extension-webgpu-service-worker/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── background.ts
│ ├── content.js
│ ├── example.html
│ ├── manifest.json
│ ├── popup.css
│ ├── popup.html
│ └── popup.ts
├── embeddings/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── embeddings.html
│ └── embeddings.ts
├── function-calling/
│ ├── README.md
│ ├── function-calling-manual/
│ │ ├── README.md
│ │ ├── package.json
│ │ └── src/
│ │ ├── function_calling_manual.html
│ │ └── function_calling_manual.ts
│ └── function-calling-openai/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── function_calling_openai.html
│ └── function_calling_openai.ts
├── get-started/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── get_started.html
│ └── get_started.ts
├── get-started-latency-breakdown/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── get_started_latency_breakdown.html
│ └── get_started_latency_breakdown.ts
├── get-started-web-worker/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── get_started.html
│ ├── main.ts
│ └── worker.ts
├── json-mode/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── json_mode.html
│ └── json_mode.ts
├── json-schema/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── json_schema.html
│ └── json_schema.ts
├── logit-processor/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── logit_processor.html
│ ├── logit_processor.ts
│ ├── my_logit_processor.ts
│ └── worker.ts
├── multi-models/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── main.ts
│ ├── multi_models.html
│ └── worker.ts
├── multi-round-chat/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── multi_round_chat.html
│ └── multi_round_chat.ts
├── next-simple-chat/
│ ├── README.md
│ ├── next.config.js
│ ├── package.json
│ ├── postcss.config.js
│ ├── tailwind.config.js
│ ├── tsconfig.json
│ └── src/
│ ├── pages/
│ │ ├── _app.tsx
│ │ ├── _document.tsx
│ │ ├── index.tsx
│ │ └── api/
│ │ └── hello.ts
│ ├── styles/
│ │ └── globals.css
│ └── utils/
│ ├── chat_component.tsx
│ └── chat_ui.ts
├── qwen3/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── qwen3_example.html
│ └── qwen3_example.ts
├── seed-to-reproduce/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── seed.html
│ └── seed.ts
├── service-worker/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── index.html
│ ├── main.ts
│ └── sw.ts
├── simple-chat-js/
│ ├── index.css
│ ├── index.html
│ └── index.js
├── simple-chat-ts/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── gh-config.js
│ ├── llm_chat.css
│ ├── llm_chat.html
│ ├── simple_chat.ts
│ └── worker.ts
├── simple-chat-upload/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── gh-config.js
│ ├── llm_chat.css
│ ├── llm_chat.html
│ ├── simple_chat.ts
│ └── worker.ts
├── streaming/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── streaming.html
│ └── streaming.ts
├── text-completion/
│ ├── README.md
│ ├── package.json
│ └── src/
│ ├── text_completion.html
│ └── text_completion.ts
└── vision-model/
├── README.md
├── package.json
└── src/
├── utils.ts
├── vision_model.html
├── vision_model.ts
└── worker.ts
================================================
FILE: examples/README.md
================================================
# Awesome WebLLM
This page contains a curated list of examples, tutorials, and blogs about WebLLM use cases.
Please send a pull request if you find things that belong here.
## Example Projects
Note that all examples below run in-browser and use WebGPU as a backend.
#### Project List
- [get-started](get-started): minimum get started example with chat completion.
[](https://jsfiddle.net/neetnestor/yac9gbwf/)
[](https://codepen.io/neetnestor/pen/NWVdgey)
- [simple-chat-js](simple-chat-js): a minimal and complete chat bot app in vanilla JavaScript.
[](https://jsfiddle.net/neetnestor/4nmgvsa2/)
[](https://codepen.io/neetnestor/pen/vYwgZaG)
- [simple-chat-ts](simple-chat-ts): a minimal and complete chat bot app in TypeScript.
- [get-started-web-worker](get-started-web-worker): same as get-started, but using web worker.
- [next-simple-chat](next-simple-chat): a minimal and complete chat bot app with [Next.js](https://nextjs.org/).
- [multi-round-chat](multi-round-chat): while APIs are functional, we internally optimize so that multi round chat usage can reuse KV cache
- [text-completion](text-completion): demonstrates API `engine.completions.create()`, which is pure text completion with no conversation, as opposed to `engine.chat.completions.create()`
- [embeddings](embeddings): demonstrates API `engine.embeddings.create()`, integration with `EmbeddingsInterface` and `MemoryVectorStore` of [Langchain.js](https://js.langchain.com), and RAG with Langchain.js using WebLLM for both LLM and Embedding in a single engine
- [multi-models](multi-models): demonstrates loading multiple models in a single engine concurrently
#### Advanced OpenAI API Capabilities
These examples demonstrate various capabilities via WebLLM's OpenAI-like API.
- [streaming](streaming): return output as chunks in real-time in the form of an AsyncGenerator
- [json-mode](json-mode): efficiently ensure output is in json format, see [OpenAI Reference](https://platform.openai.com/docs/guides/text-generation/chat-completions-api) for more.
- [json-schema](json-schema): besides guaranteeing output to be in JSON, ensure the output adheres to a specific JSON schema specified by the user
- [seed-to-reproduce](seed-to-reproduce): use seeding to ensure reproducible output with fields `seed`.
- [function-calling](function-calling) (WIP): function calling with fields `tools` and `tool_choice` (with preliminary support).
- [vision-model](vision-model): process request with image input using Vision Language Model (e.g. Phi3.5-vision)
#### Chrome Extension
- [chrome-extension](chrome-extension): chrome extension that does not have a persistent background
- [chrome-extension-webgpu-service-worker](chrome-extension-webgpu-service-worker): chrome extension using service worker, hence having a persistent background
#### Others
- [logit-processor](logit-processor): while `logit_bias` is supported, we additionally support stateful logit processing where users can specify their own rules. We also expose low-level API `forwardTokensAndSample()`.
- [cache-usage](cache-usage): demonstrates how WebLLM supports both the [Cache API](https://developer.mozilla.org/en-US/docs/Web/API/Cache) and [IndexedDB cache](https://developer.mozilla.org/en-US/docs/Web/API/IndexedDB_API), and
users can pick with `appConfig.useIndexedDBCache`. Also demonstrates various cache utils such as checking
whether a model is cached, deleting a model's weights from cache, deleting a model library wasm from cache, etc.
- [simple-chat-upload](simple-chat-upload): demonstrates how to upload local models to WebLLM instead of downloading via a URL link
## Demo Spaces
- [web-llm-embed](https://huggingface.co/spaces/matthoffner/web-llm-embed): document chat prototype using react-llm with transformers.js embeddings
- [DeVinci](https://x6occ-biaaa-aaaai-acqzq-cai.icp0.io/): AI chat app based on WebLLM and hosted on decentralized cloud platform
================================================
FILE: examples/abort-reload/README.md
================================================
# WebLLM Get Started App
This folder provides a demo for cancelling model fetching after calling `engine.reload()`.
```bash
npm install
npm start
```
Note: if you would like to hack on the WebLLM core package, you can change the
web-llm dependency to `"file:../.."` and follow the build-from-source
instructions in the project to build webllm locally. This option is only
recommended if you intend to modify the WebLLM core package.
================================================
FILE: examples/abort-reload/package.json
================================================
{
"name": "get-started",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/get_started.html --port 8887",
"build": "parcel build src/get_started.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/abort-reload/src/get_started.html
================================================
WebLLM Test Page
Open console to see output
Prompt
Response
================================================
FILE: examples/abort-reload/src/get_started.js
================================================
import * as webllm from "@mlc-ai/web-llm";
import { error } from "loglevel";
let engine;
function setLabel(id, text) {
const label = document.getElementById(id);
if (label == null) {
throw Error("Cannot find label " + id);
}
label.innerText = text;
}
async function main() {
const initProgressCallback = (report) => {
console.log(report.text);
setLabel("init-label", report.text);
};
// Option 1: If we do not specify appConfig, we use `prebuiltAppConfig` defined in `config.ts`
const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
engine = new webllm.MLCEngine({
initProgressCallback,
});
engine.reload(selectedModel);
}
main();
setTimeout(() => {
console.log("calling unload");
engine.unload().catch((err) => {
console.log(err);
});
}, 5000);
================================================
FILE: examples/cache-usage/README.md
================================================
# WebLLM Cache Usage
WebLLM supports both the Cache API and IndexedDB, which you can specify via `AppConfig.useIndexedDBCache`.
This folder provides an example on how Cache and IndexedDB Cache are used in WebLLM. We also
demonstrate the utility cache functions such as deleting models, checking if models are in cache, etc.
For more information about the two caches, see: https://developer.mozilla.org/en-US/docs/Web/API/Storage_API/Storage_quotas_and_eviction_criteria#what_technologies_store_data_in_the_browser.
To inspect the downloaded artifacts in your browser, open up developer console, go to application,
and you will find the artifacts under either `IndexedDB` or `Cache storage`.
To run the example, you can do the following steps under this folder
```bash
npm install
npm start
```
Note if you would like to hack WebLLM core package.
You can change web-llm dependencies as `"file:../.."`, and follow the build from source
instruction in the project to build webllm locally. This option is only recommended
if you would like to hack WebLLM core package.
================================================
FILE: examples/cache-usage/package.json
================================================
{
"name": "cache-usage",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/cache_usage.html --port 8888",
"build": "parcel build src/cache_usage.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/cache-usage/src/cache_usage.html
================================================
WebLLM Test Page
Open console to see output
Prompt
Response
================================================
FILE: examples/cache-usage/src/cache_usage.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
function setLabel(id: string, text: string) {
const label = document.getElementById(id);
if (label == null) {
throw Error("Cannot find label " + id);
}
label.innerText = text;
}
const initProgressCallback = (report: webllm.InitProgressReport) => {
setLabel("init-label", report.text);
};
async function main() {
const appConfig = webllm.prebuiltAppConfig;
// CHANGE THIS TO SEE EFFECTS OF BOTH, CODE BELOW DO NOT NEED TO CHANGE
appConfig.useIndexedDBCache = true;
if (appConfig.useIndexedDBCache) {
console.log("Using IndexedDB Cache");
} else {
console.log("Using Cache API");
}
// 1. This triggers downloading and caching the model with either Cache or IndexedDB Cache
const selectedModel = "phi-2-q4f16_1-MLC";
const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
selectedModel,
{ initProgressCallback: initProgressCallback, appConfig: appConfig },
);
const request: webllm.ChatCompletionRequest = {
stream: false,
messages: [
{
role: "user",
content: "Write an analogy between mathematics and a lighthouse.",
},
],
n: 1,
};
let reply = await engine.chat.completions.create(request);
console.log(reply);
// 2. Check whether model weights are cached
let modelCached = await webllm.hasModelInCache(selectedModel, appConfig);
console.log("hasModelInCache: ", modelCached);
if (!modelCached) {
throw Error("Expect hasModelInCache() to be true, but got: " + modelCached);
}
// 3. We reload, and we should see this time it is much faster because the weights are cached.
console.log("Reload model start");
await engine.reload(selectedModel);
console.log("Reload model end");
reply = await engine.chat.completions.create(request);
console.log(reply);
// 4. Delete every thing about this model from cache
// You can also delete only the model library wasm, only the model weights, or only the config file
await webllm.deleteModelAllInfoInCache(selectedModel, appConfig);
modelCached = await webllm.hasModelInCache(selectedModel, appConfig);
console.log("After deletion, hasModelInCache: ", modelCached);
if (modelCached) {
throw Error(
"Expect hasModelInCache() to be false, but got: " + modelCached,
);
}
// 5. If we reload, we should expect the model to start downloading again
console.log("Reload model start");
await engine.reload(selectedModel);
console.log("Reload model end");
reply = await engine.chat.completions.create(request);
console.log(reply);
}
main();
================================================
FILE: examples/chrome-extension/README.md
================================================
# WebLLM Chrome Extension

To run the extension, do the following steps under this folder
```bash
npm install
npm run build
```
This will create a new directory at `chrome-extension/dist/`. To load the extension into Chrome, go to Extensions > Manage Extensions and select Load Unpacked. Add the `chrome-extension/dist/` directory. You can now pin the extension to your toolbar and use the drop-down menu to chat with your favorite model!
================================================
FILE: examples/chrome-extension/package.json
================================================
{
"name": "chrome-extension",
"version": "1.0.1",
"description": "",
"private": true,
"scripts": {
"build": "parcel build src/manifest.json --config @parcel/config-webextension"
},
"author": "",
"license": "ISC",
"devDependencies": {
"@parcel/config-webextension": "^2.9.3",
"@types/chrome": "^0.0.242",
"buffer": "^6.0.3",
"parcel": "^2.9.3",
"process": "^0.11.10",
"url": "^0.11.1"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80",
"progressbar.js": "^1.1.0"
}
}
================================================
FILE: examples/chrome-extension/src/content.js
================================================
// Only the content script is able to access the DOM
chrome.runtime.onConnect.addListener(function (port) {
port.onMessage.addListener(function (msg) {
port.postMessage({ contents: document.body.innerText });
});
});
================================================
FILE: examples/chrome-extension/src/example.html
================================================
In the year 2154, humanity had colonized several planets in the distant reaches
of the galaxy. The planet of Xylophia-IV was one of the most remote and
inhospitable, with temperatures often dropping to -200 degrees Celsius. Despite
these harsh conditions, a team of scientists had established a research station
on the planet to study the unique geological formations and exotic flora and
fauna. One day, while conducting a routine survey of the planet's surface, the
team discovered a strange object buried deep in the ice. As they examined it
closer, they realized it was a small, metallic capsule with a glowing blue
symbol etched onto its surface. The team's leader, a brilliant scientist named
Dr. Maria Rodriguez, was immediately intrigued by the capsule's mysterious
origins. She ordered her team to bring it back to the research station for
further analysis. After weeks of studying the capsule, the team finally cracked
the code to the symbol etched onto its surface. It was a message from an alien
race, warning Earth of an impending attack from an unknown threat. The team was
shocked and dismayed by the news, but they knew they had to act quickly to warn
the rest of humanity. They transmitted the message to the nearest space station,
which relayed it to Earth's government. As the threat of attack loomed near, the
team remained on high alert, ready to face whatever dangers lay ahead. They had
uncovered the secrets of the universe, and now they were determined to protect
their planet and its inhabitants at all costs.
================================================
FILE: examples/chrome-extension/src/manifest.json
================================================
{
"manifest_version": 3,
"name": "MLCBot",
"version": "0.1.1",
"description": "Chat with your browser",
"icons": {
"16": "icons/icon-16.png",
"32": "icons/icon-32.png",
"64": "icons/icon-64.png",
"128": "icons/icon-128.png"
},
"content_security_policy": {
"extension_pages": "style-src-elem 'self' https://cdnjs.cloudflare.com; font-src 'self' https://cdnjs.cloudflare.com; script-src 'self' 'wasm-unsafe-eval'; default-src 'self' data:; connect-src 'self' data: http://localhost:8000 https://huggingface.co https://cdn-lfs.huggingface.co https://cdn-lfs-us-1.huggingface.co https://raw.githubusercontent.com https://cdn-lfs-us-1.hf.co"
},
"action": {
"default_title": "MLCBot",
"default_popup": "popup.html"
},
"content_scripts": [
{
"matches": [""],
"js": ["content.js"]
}
],
"permissions": ["storage", "tabs", "webNavigation", "activeTab", "scripting"],
"host_permissions": ["http://*/", "https://*/"]
}
================================================
FILE: examples/chrome-extension/src/manifest_v2.json
================================================
{
"manifest_version": 2,
"name": "MLCBot",
"version": "0.1.0",
"description": "Chat with your browser",
"icons": {
"16": "icons/icon-16.png",
"32": "icons/icon-32.png",
"64": "icons/icon-64.png",
"128": "icons/icon-128.png"
},
"content_security_policy": "style-src-elem 'self' https://cdnjs.cloudflare.com; font-src 'self' https://cdnjs.cloudflare.com; script-src 'self' 'unsafe-eval' 'wasm-unsafe-eval'; default-src 'self' data:; connect-src 'self' data: http://localhost:8000 https://huggingface.co https://cdn-lfs.huggingface.co https://raw.githubusercontent.com https://cdn-lfs-us-1.hf.co",
"browser_action": {
"default_popup": "popup.html"
},
"content_scripts": [
{
"matches": [""],
"js": ["content.js"]
}
],
"permissions": ["storage", "tabs", "webNavigation", "activeTab"]
}
================================================
FILE: examples/chrome-extension/src/popup.css
================================================
*,
*::before,
*::after {
margin: 0;
padding: 0;
box-sizing: border-box;
}
html {
font-family:
-apple-system,
BlinkMacSystemFont,
Segoe UI,
Helvetica,
Arial,
sans-serif;
color: #222;
}
body {
margin: 0;
padding: 0.5rem;
background-color: #778da9;
width: 335px;
font-size: small;
}
p {
margin: 0;
}
/* LOADING BAR */
#loadingContainer {
margin-bottom: 15px;
width: 315px;
height: 8px;
}
/* INPUT AREA */
#query-input {
border: 1px solid #ccc;
border-radius: 4px;
}
.input-container {
display: flex;
flex-direction: row;
align-items: center;
}
.input-container input {
width: 100%;
outline: none;
padding: 0.5rem;
margin-right: 0.5rem;
}
/* BUTTON */
.btn {
background-color: #1b263b;
color: white;
font-size: small;
cursor: pointer;
border-radius: 4px;
border: none;
padding: 0.5rem;
}
.btn:hover {
background-color: #d0d0d0;
}
.btn:disabled {
background-color: #a7a7a7;
color: rgb(255, 255, 255);
cursor: default;
}
.btn img {
width: 1rem;
height: 1rem;
}
/* LOADING */
.stage {
display: flex;
justify-content: center;
align-items: center;
position: relative;
margin: 0 -5%;
overflow: hidden;
}
#loading-indicator {
display: none;
color: white;
margin-top: 0.5rem;
}
.dot-flashing {
position: relative;
width: 10px;
height: 10px;
border-radius: 5px;
background-color: #1b263b;
color: #1b263b;
animation: dot-flashing 0.4s infinite linear alternate;
animation-delay: 0.2s;
}
.dot-flashing::before,
.dot-flashing::after {
content: "";
display: inline-block;
position: absolute;
top: 0;
}
.dot-flashing::before {
left: -15px;
width: 10px;
height: 10px;
border-radius: 5px;
background-color: #1b263b;
color: #1b263b;
animation: dot-flashing 0.4s infinite alternate;
animation-delay: 0s;
}
.dot-flashing::after {
left: 15px;
width: 10px;
height: 10px;
border-radius: 5px;
background-color: #1b263b;
color: #1b263b;
animation: dot-flashing 0.4s infinite alternate;
animation-delay: 0.4s;
}
@keyframes dot-flashing {
0% {
background-color: #1b263b;
}
50%,
100% {
background-color: #415a77;
}
}
/* ANSWERS */
#queriesAnswersContainer {
display: block;
color: white;
margin-top: 0.5rem;
}
#answer {
color: #333333;
}
#answerWrapper {
display: none;
background-color: #ffd166;
border-radius: 8px;
padding: 0.5rem;
margin-top: 0.5rem;
}
.queriesAnswers {
border-radius: 8px;
background-color: #ffd166;
padding: 0.5rem;
color: #333333;
}
#lastQuery {
color: rgb(188, 188, 188);
}
#lastAnswer {
color: white;
margin-top: 0.5rem;
}
#lastRequest {
padding: 0.5rem;
margin-top: 0.5rem;
background-color: #333333;
border-radius: 4px;
}
/* ANSWER OPTIONS */
.timeStamp {
color: #9a8c98;
}
.copyRow {
display: flex;
flex-direction: row;
align-items: end;
justify-content: space-between;
color: #a7a7a7;
margin-top: 0.5rem;
}
.copyText {
display: none;
color: #a7a7a7;
margin-right: 0.5rem;
}
.copyButton {
color: #415a77;
background-color: transparent;
border: none;
cursor: pointer;
padding: 0;
margin-left: 0.5rem;
}
.copyButton:hover {
color: #5e80a7;
background-color: transparent;
}
.removeButton {
color: #415a77;
background-color: transparent;
border: none;
cursor: pointer;
padding: 0;
}
.removeButton:hover {
color: #5e80a7;
background-color: transparent;
}
================================================
FILE: examples/chrome-extension/src/popup.html
================================================
Chatbot
Initializing model...
================================================
FILE: examples/chrome-extension/src/popup.ts
================================================
"use strict";
// This code is partially adapted from the openai-chatgpt-chrome-extension repo:
// https://github.com/jessedi0n/openai-chatgpt-chrome-extension
import "./popup.css";
import {
MLCEngineInterface,
InitProgressReport,
CreateMLCEngine,
ChatCompletionMessageParam,
prebuiltAppConfig,
} from "@mlc-ai/web-llm";
import { ProgressBar, Line } from "progressbar.js";
// modified setLabel to not throw error
function setLabel(id: string, text: string) {
const label = document.getElementById(id);
if (label != null) {
label.innerText = text;
}
}
function getElementAndCheck(id: string): HTMLElement {
const element = document.getElementById(id);
if (element == null) {
throw Error("Cannot find element " + id);
}
return element;
}
const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms));
const queryInput = getElementAndCheck("query-input")!;
const submitButton = getElementAndCheck("submit-button")!;
const modelName = getElementAndCheck("model-name");
let context = "";
let modelDisplayName = "";
// throws runtime.lastError if you refresh extension AND try to access a webpage that is already open
fetchPageContents();
(submitButton).disabled = true;
let progressBar: ProgressBar = new Line("#loadingContainer", {
strokeWidth: 4,
easing: "easeInOut",
duration: 1400,
color: "#ffd166",
trailColor: "#eee",
trailWidth: 1,
svgStyle: { width: "100%", height: "100%" },
});
let isLoadingParams = true;
let initProgressCallback = (report: InitProgressReport) => {
setLabel("init-label", report.text);
progressBar.animate(report.progress, {
duration: 50,
});
if (report.progress == 1.0) {
enableInputs();
}
};
// initially selected model
let selectedModel = "Qwen2-0.5B-Instruct-q4f16_1-MLC";
// populate model-selection
const modelSelector = getElementAndCheck(
"model-selection",
) as HTMLSelectElement;
for (let i = 0; i < prebuiltAppConfig.model_list.length; ++i) {
const model = prebuiltAppConfig.model_list[i];
const opt = document.createElement("option");
opt.value = model.model_id;
opt.innerHTML = model.model_id;
opt.selected = false;
// set initial selection as the initially selected model
if (model.model_id == selectedModel) {
opt.selected = true;
}
modelSelector.appendChild(opt);
}
modelName.innerText = "Loading initial model...";
const engine: MLCEngineInterface = await CreateMLCEngine(selectedModel, {
initProgressCallback: initProgressCallback,
});
modelName.innerText = "Now chatting with " + modelDisplayName;
let chatHistory: ChatCompletionMessageParam[] = [];
function enableInputs() {
if (isLoadingParams) {
sleep(500);
isLoadingParams = false;
}
// remove loading bar and loading bar descriptors, if exists
const initLabel = document.getElementById("init-label");
initLabel?.remove();
const loadingBarContainer = document.getElementById("loadingContainer")!;
loadingBarContainer?.remove();
queryInput.focus();
const modelNameArray = selectedModel.split("-");
modelDisplayName = modelNameArray[0];
let j = 1;
while (j < modelNameArray.length && modelNameArray[j][0] != "q") {
modelDisplayName = modelDisplayName + "-" + modelNameArray[j];
j++;
}
}
let requestInProgress = false;
// Disable submit button if input field is empty
queryInput.addEventListener("keyup", () => {
if (
(queryInput).value === "" ||
requestInProgress ||
isLoadingParams
) {
(submitButton).disabled = true;
} else {
(submitButton).disabled = false;
}
});
// If user presses enter, click submit button
queryInput.addEventListener("keyup", (event) => {
if (event.code === "Enter") {
event.preventDefault();
submitButton.click();
}
});
// Listen for clicks on submit button
async function handleClick() {
requestInProgress = true;
(submitButton).disabled = true;
// Get the message from the input field
const message = (queryInput).value;
console.log("message", message);
// Clear the answer
document.getElementById("answer")!.innerHTML = "";
// Hide the answer
document.getElementById("answerWrapper")!.style.display = "none";
// Show the loading indicator
document.getElementById("loading-indicator")!.style.display = "block";
// Generate response
let inp = message;
if (context.length > 0) {
inp =
"Use only the following context when answering the question at the end. Don't use any other knowledge.\n" +
context +
"\n\nQuestion: " +
message +
"\n\nHelpful Answer: ";
}
console.log("Input:", inp);
chatHistory.push({ role: "user", content: inp });
let curMessage = "";
const completion = await engine.chat.completions.create({
stream: true,
messages: chatHistory,
});
for await (const chunk of completion) {
const curDelta = chunk.choices[0].delta.content;
if (curDelta) {
curMessage += curDelta;
}
updateAnswer(curMessage);
}
const response = await engine.getMessage();
chatHistory.push({ role: "assistant", content: await engine.getMessage() });
console.log("response", response);
requestInProgress = false;
(submitButton).disabled = false;
}
submitButton.addEventListener("click", handleClick);
// listen for changes in modelSelector
async function handleSelectChange() {
if (isLoadingParams) {
return;
}
modelName.innerText = "";
const initLabel = document.createElement("p");
initLabel.id = "init-label";
initLabel.innerText = "Initializing model...";
const loadingContainer = document.createElement("div");
loadingContainer.id = "loadingContainer";
const loadingBox = getElementAndCheck("loadingBox");
loadingBox.appendChild(initLabel);
loadingBox.appendChild(loadingContainer);
isLoadingParams = true;
(submitButton).disabled = true;
if (requestInProgress) {
engine.interruptGenerate();
}
engine.resetChat();
chatHistory = [];
await engine.unload();
selectedModel = modelSelector.value;
progressBar = new Line("#loadingContainer", {
strokeWidth: 4,
easing: "easeInOut",
duration: 1400,
color: "#ffd166",
trailColor: "#eee",
trailWidth: 1,
svgStyle: { width: "100%", height: "100%" },
});
initProgressCallback = (report: InitProgressReport) => {
setLabel("init-label", report.text);
progressBar.animate(report.progress, {
duration: 50,
});
if (report.progress == 1.0) {
enableInputs();
}
};
engine.setInitProgressCallback(initProgressCallback);
requestInProgress = true;
modelName.innerText = "Reloading with new model...";
await engine.reload(selectedModel);
requestInProgress = false;
modelName.innerText = "Now chatting with " + modelDisplayName;
}
modelSelector.addEventListener("change", handleSelectChange);
// Listen for messages from the background script
chrome.runtime.onMessage.addListener(({ answer, error }) => {
if (answer) {
updateAnswer(answer);
}
});
function updateAnswer(answer: string) {
// Show answer
document.getElementById("answerWrapper")!.style.display = "block";
const answerWithBreaks = answer.replace(/\n/g, " ");
document.getElementById("answer")!.innerHTML = answerWithBreaks;
// Add event listener to copy button
document.getElementById("copyAnswer")!.addEventListener("click", () => {
// Get the answer text
const answerText = answer;
// Copy the answer text to the clipboard
navigator.clipboard
.writeText(answerText)
.then(() => console.log("Answer text copied to clipboard"))
.catch((err) => console.error("Could not copy text: ", err));
});
const options: Intl.DateTimeFormatOptions = {
month: "short",
day: "2-digit",
hour: "2-digit",
minute: "2-digit",
second: "2-digit",
};
const time = new Date().toLocaleString("en-US", options);
// Update timestamp
document.getElementById("timestamp")!.innerText = time;
// Hide loading indicator
document.getElementById("loading-indicator")!.style.display = "none";
}
/**
 * Connect to the content script of the active tab and store the page HTML
 * into the module-level `context` for later use as chat context.
 */
function fetchPageContents() {
  chrome.tabs.query({ currentWindow: true, active: true }, function (tabs) {
    // Guard: `tabs[0].id` can be undefined (e.g. devtools or chrome:// pages).
    // Mirrors the guard used by the service-worker variant of this example.
    if (tabs[0]?.id) {
      const port = chrome.tabs.connect(tabs[0].id, { name: "channelName" });
      port.postMessage({});
      port.onMessage.addListener(function (msg) {
        console.log("Page contents:", msg.contents);
        // `context` is declared earlier in this file.
        context = msg.contents;
      });
    }
  });
}
================================================
FILE: examples/chrome-extension-webgpu-service-worker/README.md
================================================
# WebLLM Chrome Extension using WebGPU Running on Service Worker

> [!WARNING]
> Service worker support in WebGPU is enabled by default in [Chrome 124](https://chromiumdash.appspot.com/commit/8d78510e4aca5ac3cd8ee4a33e96b404eaa43246).
> If you are using Chrome 123, go to `chrome://flags/#enable-experimental-web-platform-features`, enable the `#enable-experimental-web-platform-features` flag, and **relaunch the browser**.
This example shows how we can create a Chrome extension using WebGPU and service worker.
- The project structure is as follows:
- `manifest.json`: A required file that lists important information about the structure and behavior of that extension. Here we are using manifest V3.
- `popup.ts`: Script of the extension pop-up window.
- `background.ts`: Script of the service worker. An extension service worker is loaded when it is needed, and unloaded when it goes dormant.
- `content.js`: Content script that interacts with DOM.
- Run
```bash
npm install
npm run build
```
This will create a new directory at `./dist/`. To load the extension into Chrome, go to Extensions > Manage Extensions and select Load Unpacked. Add the `./dist/` directory. You can now pin the extension to your toolbar and use it to chat with your favorite model!
**Note**: This example disables chatting using the contents of the active tab by default.
To enable it, set `useContext` in `popup.ts` to `true`. More info about this feature can be found
[here](https://github.com/mlc-ai/web-llm/pull/190).
However, if the web content is too large, it might run into issues. We recommend using `example.html` to
test this feature.
================================================
FILE: examples/chrome-extension-webgpu-service-worker/package.json
================================================
{
"name": "chrome-extension",
"version": "1.0.0",
"description": "",
"private": true,
"scripts": {
"build": "parcel build src/manifest.json --config @parcel/config-webextension"
},
"author": "",
"license": "ISC",
"devDependencies": {
"@parcel/config-webextension": "^2.9.3",
"@types/chrome": "^0.0.242",
"buffer": "^6.0.3",
"parcel": "^2.9.3",
"process": "^0.11.10",
"url": "^0.11.1"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80",
"progressbar.js": "^1.1.0"
}
}
================================================
FILE: examples/chrome-extension-webgpu-service-worker/src/background.ts
================================================
import { ExtensionServiceWorkerMLCEngineHandler } from "@mlc-ai/web-llm";
// Hookup an engine to a service worker handler.
// A single handler is lazily created on the first popup connection and reused
// afterwards; reconnects only swap in the new port.
// Fix: annotate `handler` instead of leaving it implicitly `any`.
let handler: ExtensionServiceWorkerMLCEngineHandler | undefined;

chrome.runtime.onConnect.addListener(function (port) {
  console.assert(port.name === "web_llm_service_worker");
  if (handler === undefined) {
    handler = new ExtensionServiceWorkerMLCEngineHandler(port);
  } else {
    handler.setPort(port);
  }
  // Bind so `this` inside onmessage refers to the handler instance.
  port.onMessage.addListener(handler.onmessage.bind(handler));
});
================================================
FILE: examples/chrome-extension-webgpu-service-worker/src/content.js
================================================
// Only the content script is able to access the DOM.
// Any message on the connected port triggers a reply with the page body HTML.
chrome.runtime.onConnect.addListener((port) => {
  port.onMessage.addListener(() => {
    port.postMessage({ contents: document.body.innerHTML });
  });
});
================================================
FILE: examples/chrome-extension-webgpu-service-worker/src/example.html
================================================
In the year 2154, humanity had colonized several planets in the distant reaches
of the galaxy. The planet of Xylophia-IV was one of the most remote and
inhospitable, with temperatures often dropping to -200 degrees Celsius. Despite
these harsh conditions, a team of scientists had established a research station
on the planet to study the unique geological formations and exotic flora and
fauna. One day, while conducting a routine survey of the planet's surface, the
team discovered a strange object buried deep in the ice. As they examined it
closer, they realized it was a small, metallic capsule with a glowing blue
symbol etched onto its surface. The team's leader, a brilliant scientist named
Dr. Maria Rodriguez, was immediately intrigued by the capsule's mysterious
origins. She ordered her team to bring it back to the research station for
further analysis. After weeks of studying the capsule, the team finally cracked
the code to the symbol etched onto its surface. It was a message from an alien
race, warning Earth of an impending attack from an unknown threat. The team was
shocked and dismayed by the news, but they knew they had to act quickly to warn
the rest of humanity. They transmitted the message to the nearest space station,
which relayed it to Earth's government. As the threat of attack loomed near, the
team remained on high alert, ready to face whatever dangers lay ahead. They had
uncovered the secrets of the universe, and now they were determined to protect
their planet and its inhabitants at all costs.
================================================
FILE: examples/chrome-extension-webgpu-service-worker/src/manifest.json
================================================
{
"manifest_version": 3,
"name": "MLCBot",
"version": "0.1.0",
"description": "Chat with your browser",
"icons": {
"16": "icons/icon-16.png",
"32": "icons/icon-32.png",
"64": "icons/icon-64.png",
"128": "icons/icon-128.png"
},
"content_security_policy": {
"extension_pages": "style-src-elem 'self' https://cdnjs.cloudflare.com; font-src 'self' https://cdnjs.cloudflare.com; script-src 'self' 'wasm-unsafe-eval'; default-src 'self' data:; connect-src 'self' data: http://localhost:8000 https://huggingface.co https://cdn-lfs.huggingface.co https://cdn-lfs-us-1.huggingface.co https://raw.githubusercontent.com https://cdn-lfs-us-1.hf.co"
},
"action": {
"default_title": "MLCBot",
"default_popup": "popup.html"
},
"content_scripts": [
{
"matches": ["<all_urls>"],
"js": ["content.js"]
}
],
"background": {
"service_worker": "background.ts",
"type": "module"
},
"permissions": ["storage", "tabs", "webNavigation"]
}
================================================
FILE: examples/chrome-extension-webgpu-service-worker/src/popup.css
================================================
/* Popup stylesheet for the MLCBot extension.
   Palette: #778da9 popup background, #1b263b / #415a77 accents, #ffd166 answers. */

/* Global reset: strip default spacing and use border-box sizing everywhere. */
*,
*::before,
*::after {
  margin: 0;
  padding: 0;
  box-sizing: border-box;
}

html {
  font-family:
    -apple-system,
    BlinkMacSystemFont,
    Segoe UI,
    Helvetica,
    Arial,
    sans-serif;
  color: #222;
}

/* Fixed-width body so the popup renders at a predictable 320px. */
body {
  margin: 0;
  padding: 0.5rem;
  background-color: #778da9;
  width: 320px;
  font-size: small;
}

p {
  margin: 0;
}

/* LOADING BAR */
/* Container targeted by progressbar.js (see popup.ts). */
#loadingContainer {
  margin-bottom: 15px;
  width: 300px;
  height: 8px;
}

/* INPUT AREA */
#query-input {
  border: 1px solid #ccc;
  border-radius: 4px;
}

.input-container {
  display: flex;
  flex-direction: row;
  align-items: center;
}

.input-container input {
  width: 100%;
  outline: none;
  padding: 0.5rem;
  margin-right: 0.5rem;
}

/* SUBMIT BUTTON */
.btn {
  background-color: #1b263b;
  color: white;
  font-size: small;
  cursor: pointer;
  border-radius: 4px;
  border: none;
  padding: 0.5rem;
}

.btn:hover {
  background-color: #d0d0d0;
}

.btn:disabled {
  background-color: #a7a7a7;
  color: rgb(255, 255, 255);
  cursor: default;
}

.btn img {
  width: 1rem;
  height: 1rem;
}

/* LOADING */
.stage {
  display: flex;
  justify-content: center;
  align-items: center;
  position: relative;
  margin: 0 -5%;
  overflow: hidden;
}

/* Hidden by default; popup.ts toggles it while a completion is streaming. */
#loading-indicator {
  display: none;
  color: white;
  margin-top: 0.5rem;
}

/* Three-dot "typing" animation: the element itself is the middle dot and its
   ::before/::after pseudo-elements are the left/right dots, each flashing on a
   staggered animation-delay. */
.dot-flashing {
  position: relative;
  width: 10px;
  height: 10px;
  border-radius: 5px;
  background-color: #1b263b;
  color: #1b263b;
  animation: dot-flashing 0.4s infinite linear alternate;
  animation-delay: 0.2s;
}

.dot-flashing::before,
.dot-flashing::after {
  content: "";
  display: inline-block;
  position: absolute;
  top: 0;
}

.dot-flashing::before {
  left: -15px;
  width: 10px;
  height: 10px;
  border-radius: 5px;
  background-color: #1b263b;
  color: #1b263b;
  animation: dot-flashing 0.4s infinite alternate;
  animation-delay: 0s;
}

.dot-flashing::after {
  left: 15px;
  width: 10px;
  height: 10px;
  border-radius: 5px;
  background-color: #1b263b;
  color: #1b263b;
  animation: dot-flashing 0.4s infinite alternate;
  animation-delay: 0.4s;
}

@keyframes dot-flashing {
  0% {
    background-color: #1b263b;
  }
  50%,
  100% {
    background-color: #415a77;
  }
}

/* ANSWERS */
#queriesAnswersContainer {
  display: block;
  color: white;
  margin-top: 0.5rem;
}

#answer {
  color: #333333;
}

/* Hidden until the first answer arrives; popup.ts flips it to block. */
#answerWrapper {
  display: none;
  background-color: #ffd166;
  border-radius: 8px;
  padding: 0.5rem;
  margin-top: 0.5rem;
}

.queriesAnswers {
  border-radius: 8px;
  background-color: #ffd166;
  padding: 0.5rem;
  color: #333333;
}

#lastQuery {
  color: rgb(188, 188, 188);
}

#lastAnswer {
  color: white;
  margin-top: 0.5rem;
}

#lastRequest {
  padding: 0.5rem;
  margin-top: 0.5rem;
  background-color: #333333;
  border-radius: 4px;
}

/* ANSWER OPTIONS */
.timeStamp {
  color: #9a8c98;
}

.copyRow {
  display: flex;
  flex-direction: row;
  align-items: end;
  justify-content: space-between;
  color: #a7a7a7;
  margin-top: 0.5rem;
}

.copyText {
  display: none;
  color: #a7a7a7;
  margin-right: 0.5rem;
}

.copyButton {
  color: #415a77;
  background-color: transparent;
  border: none;
  cursor: pointer;
  padding: 0;
  margin-left: 0.5rem;
}

.copyButton:hover {
  color: #5e80a7;
  background-color: transparent;
}

.removeButton {
  color: #415a77;
  background-color: transparent;
  border: none;
  cursor: pointer;
  padding: 0;
}

.removeButton:hover {
  color: #5e80a7;
  background-color: transparent;
}
================================================
FILE: examples/chrome-extension-webgpu-service-worker/src/popup.html
================================================
Chatbot
================================================
FILE: examples/chrome-extension-webgpu-service-worker/src/popup.ts
================================================
"use strict";
// This code is partially adapted from the openai-chatgpt-chrome-extension repo:
// https://github.com/jessedi0n/openai-chatgpt-chrome-extension
import "./popup.css";
import {
ChatCompletionMessageParam,
CreateExtensionServiceWorkerMLCEngine,
MLCEngineInterface,
InitProgressReport,
} from "@mlc-ai/web-llm";
import { ProgressBar, Line } from "progressbar.js";
/***************** UI elements *****************/
// Whether or not to use the content from the active tab as the context
const useContext = false;
const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms));
const queryInput = document.getElementById("query-input")!;
const submitButton = document.getElementById("submit-button")!;
let isLoadingParams = false;

// Keep the submit button disabled until model loading completes.
// Fix: `submitButton` is typed as a plain HTMLElement, which has no
// `.disabled`; cast to HTMLButtonElement (the original cast appears to have
// been lost).
(submitButton as HTMLButtonElement).disabled = true;

// Thin progress bar (progressbar.js) shown while model weights load.
const progressBar: ProgressBar = new Line("#loadingContainer", {
  strokeWidth: 4,
  easing: "easeInOut",
  duration: 1400,
  color: "#ffd166",
  trailColor: "#eee",
  trailWidth: 1,
  svgStyle: { width: "100%", height: "100%" },
});
/***************** Web-LLM MLCEngine Configuration *****************/
// Drive the progress bar from engine init reports; inputs unlock at 100%.
const initProgressCallback = (report: InitProgressReport) => {
  progressBar.animate(report.progress, {
    duration: 50,
  });
  if (report.progress == 1.0) {
    enableInputs();
  }
};

// Top-level await: the popup module waits for the service-worker-backed
// engine to be created before wiring up the rest of the UI.
const engine: MLCEngineInterface = await CreateExtensionServiceWorkerMLCEngine(
  "Qwen2-0.5B-Instruct-q4f16_1-MLC",
  { initProgressCallback: initProgressCallback },
);
// Running transcript sent with every completion request.
const chatHistory: ChatCompletionMessageParam[] = [];

isLoadingParams = true;
/**
 * Unlock the query UI once model loading completes. The short pause lets the
 * progress bar finish its final animation before the bar is removed.
 */
async function enableInputs() {
  if (isLoadingParams) {
    // Clear the flag before awaiting so a second call cannot re-enter and
    // remove the (already removed) loading bar.
    isLoadingParams = false;
    // Fix: `sleep(500)` was called without `await`, so the intended 500 ms
    // pause never actually happened.
    await sleep(500);
    (submitButton as HTMLButtonElement).disabled = false;
    const loadingBarContainer = document.getElementById("loadingContainer")!;
    loadingBarContainer.remove();
    queryInput.focus();
  }
}
/***************** Event Listeners *****************/
// Disable submit button if input field is empty.
// Fix: restore the element casts (`queryInput` is typed as HTMLElement, which
// has no `.value`; `submitButton` has no `.disabled`).
queryInput.addEventListener("keyup", () => {
  if ((queryInput as HTMLInputElement).value === "") {
    (submitButton as HTMLButtonElement).disabled = true;
  } else {
    (submitButton as HTMLButtonElement).disabled = false;
  }
});

// If user presses enter, click submit button
queryInput.addEventListener("keyup", (event) => {
  if (event.code === "Enter") {
    event.preventDefault();
    submitButton.click();
  }
});
// Listen for clicks on submit button.
/**
 * Send the current input to the engine and stream the reply into the popup,
 * appending both sides of the exchange to `chatHistory`.
 */
async function handleClick() {
  // Get the message from the input field.
  // Fix: restore the cast — `queryInput` is typed as HTMLElement, which has
  // no `.value`.
  const message = (queryInput as HTMLInputElement).value;
  console.log("message", message);
  chatHistory.push({ role: "user", content: message });
  // Clear the answer
  document.getElementById("answer")!.innerHTML = "";
  // Hide the answer
  document.getElementById("answerWrapper")!.style.display = "none";
  // Show the loading indicator
  document.getElementById("loading-indicator")!.style.display = "block";
  // Send the chat completion message to the engine
  let curMessage = "";
  const completion = await engine.chat.completions.create({
    stream: true,
    messages: chatHistory,
  });
  // Update the answer as the model generates more text
  for await (const chunk of completion) {
    const curDelta = chunk.choices[0].delta.content;
    if (curDelta) {
      curMessage += curDelta;
    }
    updateAnswer(curMessage);
  }
  // Record the finalized assistant message in the transcript.
  chatHistory.push({ role: "assistant", content: await engine.getMessage() });
}
submitButton.addEventListener("click", handleClick);
/**
 * Render the (possibly partial) model answer into the popup.
 * Called once per streamed chunk, then once more at the end.
 */
function updateAnswer(answer: string) {
  // Show answer
  document.getElementById("answerWrapper")!.style.display = "block";
  // Preserve the model's line breaks when rendering as HTML (the previous
  // `" "` replacement flattened them, contradicting the variable name).
  const answerWithBreaks = answer.replace(/\n/g, "<br>");
  // NOTE(review): the answer is inserted as raw HTML; escape it first if
  // untrusted markup in model output is a concern.
  document.getElementById("answer")!.innerHTML = answerWithBreaks;
  // Assign `onclick` (instead of addEventListener) so repeated streaming
  // updates replace the copy handler rather than stacking one listener per
  // chunk, which caused N clipboard writes per click.
  document.getElementById("copyAnswer")!.onclick = () => {
    // Copy the answer text to the clipboard
    navigator.clipboard
      .writeText(answer)
      .then(() => console.log("Answer text copied to clipboard"))
      .catch((err) => console.error("Could not copy text: ", err));
  };
  const options: Intl.DateTimeFormatOptions = {
    month: "short",
    day: "2-digit",
    hour: "2-digit",
    minute: "2-digit",
    second: "2-digit",
  };
  const time = new Date().toLocaleString("en-US", options);
  // Update timestamp
  document.getElementById("timestamp")!.innerText = time;
  // Hide loading indicator
  document.getElementById("loading-indicator")!.style.display = "none";
}
/**
 * Ask the active tab's content script for the page contents and forward them
 * to the background script as chat context.
 */
function fetchPageContents() {
  chrome.tabs.query({ currentWindow: true, active: true }, (tabs) => {
    const activeTabId = tabs[0]?.id;
    // Some tabs (e.g. chrome:// pages) have no connectable id.
    if (activeTabId) {
      const port = chrome.tabs.connect(activeTabId, { name: "channelName" });
      port.postMessage({});
      port.onMessage.addListener((msg) => {
        console.log("Page contents:", msg.contents);
        chrome.runtime.sendMessage({ context: msg.contents });
      });
    }
  });
}
// Grab the page contents when the popup is opened.
// `useContext` is false by default (see top of file), so this is a no-op
// unless the tab-context feature is enabled.
window.onload = function () {
  if (useContext) {
    fetchPageContents();
  }
};
================================================
FILE: examples/embeddings/README.md
================================================
# WebLLM Get Started App
This folder provides a minimum demo to show WebLLM API in a webapp setting.
To try it out, you can do the following steps under this folder
```bash
npm install
npm start
```
Note: if you would like to hack the WebLLM core package, you can change the
web-llm dependency to `"file:../.."` and follow the build-from-source
instructions in the project to build WebLLM locally. This option is only
recommended if you intend to modify the WebLLM core package.
================================================
FILE: examples/embeddings/package.json
================================================
{
"name": "embeddings-example",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/embeddings.html --port 8885",
"build": "parcel build src/embeddings.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80",
"langchain": "0.2.15"
}
}
================================================
FILE: examples/embeddings/src/embeddings.html
================================================
WebLLM Test Page
Open console to see output
Prompt
Response
================================================
FILE: examples/embeddings/src/embeddings.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
import { MemoryVectorStore } from "langchain/vectorstores/memory";
import type { EmbeddingsInterface } from "@langchain/core/embeddings";
import type { Document } from "@langchain/core/documents";
import { formatDocumentsAsString } from "langchain/util/document";
import { PromptTemplate } from "@langchain/core/prompts";
import {
RunnableSequence,
RunnablePassthrough,
} from "@langchain/core/runnables";
/** Write `text` into the element with id `id`; throws if no such element exists. */
function setLabel(id: string, text: string) {
  const el = document.getElementById(id);
  if (!el) {
    throw Error("Cannot find label " + id);
  }
  el.innerText = text;
}
// Surface engine loading progress in the "init-label" element.
const initProgressCallback = (report: webllm.InitProgressReport) => {
  setLabel("init-label", report.text);
};
// For integration with Langchain
/**
 * Adapter exposing a WebLLM engine's embeddings endpoint through Langchain's
 * `EmbeddingsInterface` so it can back a `MemoryVectorStore`.
 */
class WebLLMEmbeddings implements EmbeddingsInterface {
  engine: webllm.MLCEngineInterface;
  modelId: string;

  constructor(engine: webllm.MLCEngineInterface, modelId: string) {
    this.engine = engine;
    this.modelId = modelId;
  }

  // Fix: the async methods declared bare `Promise` return types (the generic
  // arguments were lost), which does not compile; restore the element types.
  /** Embed a batch of texts in one engine call, preserving input order. */
  async _embed(texts: string[]): Promise<number[][]> {
    const reply = await this.engine.embeddings.create({
      input: texts,
      model: this.modelId,
    });
    const result: number[][] = [];
    for (let i = 0; i < texts.length; i++) {
      result.push(reply.data[i].embedding);
    }
    return result;
  }

  /** Embed a single query string. */
  async embedQuery(document: string): Promise<number[]> {
    return this._embed([document]).then((embeddings) => embeddings[0]);
  }

  /** Embed multiple documents. */
  async embedDocuments(documents: string[]): Promise<number[][]> {
    return this._embed(documents);
  }
}
// Prepare inputs
const documents_og = ["The Data Cloud!", "Mexico City of Course!"];
const queries_og = ["what is snowflake?", "Where can I get the best tacos?"];
const query_prefix =
  "Represent this sentence for searching relevant passages: ";
// Process according to Snowflake model: documents are wrapped in [CLS]/[SEP]
// markers, and queries additionally carry the retrieval prefix.
const documents: string[] = documents_og.map((doc) => `[CLS] ${doc} [SEP]`);
const queries: string[] = queries_og.map(
  (q) => `[CLS] ${query_prefix}${q} [SEP]`,
);
console.log("Formatted documents: ", documents);
console.log("Formatted queries: ", queries);
// Using webllm's API
async function webllmAPI() {
  // "b4" = compiled max batch size of 4: the engine embeds up to 4 inputs per
  // forward pass and loops for larger batches; a bigger max batch size costs
  // more memory.
  // const selectedModel = "snowflake-arctic-embed-m-q0f32-MLC-b32";
  const selectedModel = "snowflake-arctic-embed-m-q0f32-MLC-b4";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    {
      initProgressCallback: initProgressCallback,
      logLevel: "INFO", // specify the log level
    },
  );

  // Embed the formatted documents and queries, logging replies + token usage.
  const docEmbeddings = await engine.embeddings.create({ input: documents });
  console.log(docEmbeddings);
  console.log(docEmbeddings.usage);
  const queryEmbeddings = await engine.embeddings.create({ input: queries });
  console.log(queryEmbeddings);
  console.log(queryEmbeddings.usage);

  // Calculate similarity (we use langchain here, but any method works)
  const vectorStore = await MemoryVectorStore.fromExistingIndex(
    new WebLLMEmbeddings(engine, selectedModel),
  );

  // Print each query's similarity score against every document.
  for (const [qi, query] of queries_og.entries()) {
    console.log(`Similarity with: ${query}`);
    for (const [di, doc] of documents_og.entries()) {
      const similarity = vectorStore.similarity(
        queryEmbeddings.data[qi].embedding,
        docEmbeddings.data[di].embedding,
      );
      console.log(`${doc}: ${similarity}`);
    }
  }
}
// Alternatively, integrating with Langchain's API
async function langchainAPI() {
// b4 means the max batch size is compiled as 4. That is, the model can process 4 inputs in a
// batch. If given more than 4, the model will forward multiple times. The larger the max batch
// size, the more memory it consumes.
// const selectedModel = "snowflake-arctic-embed-m-q0f32-MLC-b32";
const selectedModel = "snowflake-arctic-embed-m-q0f32-MLC-b4";
const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
selectedModel,
{
initProgressCallback: initProgressCallback,
logLevel: "INFO", // specify the log level
},
);
const vectorStore = await MemoryVectorStore.fromExistingIndex(
new WebLLMEmbeddings(engine, selectedModel),
);
const document0: Document = {
pageContent: documents[0],
metadata: {},
};
const document1: Document = {
pageContent: documents[1],
metadata: {},
};
await vectorStore.addDocuments([document0, document1]);
const similaritySearchResults0 = await vectorStore.similaritySearch(
queries[0],
1,
);
for (const doc of similaritySearchResults0) {
console.log(`* ${doc.pageContent}`);
}
const similaritySearchResults1 = await vectorStore.similaritySearch(
queries[1],
1,
);
for (const doc of similaritySearchResults1) {
console.log(`* ${doc.pageContent}`);
}
}
// RAG with Langchain.js using WebLLM for both LLM and Embedding in a single engine
// Followed https://js.langchain.com/v0.1/docs/expression_language/cookbook/retrieval/
// There are many possible ways to achieve RAG (e.g. degree of integration with Langchain,
// using WebWorker, etc.). We provide a minimal example here.
async function simpleRAG() {
  // 0. Load both embedding model and LLM to a single WebLLM Engine
  const embeddingModelId = "snowflake-arctic-embed-m-q0f32-MLC-b4";
  const llmModelId = "gemma-2-2b-it-q4f32_1-MLC-1k";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    [embeddingModelId, llmModelId],
    {
      initProgressCallback: initProgressCallback,
      logLevel: "INFO", // specify the log level
    },
  );
  // 1. Index a single toy document using the embedding model.
  const vectorStore = await MemoryVectorStore.fromTexts(
    ["mitochondria is the powerhouse of the cell"],
    [{ id: 1 }],
    new WebLLMEmbeddings(engine, embeddingModelId),
  );
  const retriever = vectorStore.asRetriever();
  // 2. Prompt template that stuffs retrieved context ahead of the question.
  const prompt =
    PromptTemplate.fromTemplate(`Answer the question based only on the following context:
{context}
Question: {question}`);
  // 3. Chain: retrieve docs -> stringify as context, pass the raw question
  //    through -> fill the prompt template.
  const chain = RunnableSequence.from([
    {
      context: retriever.pipe(formatDocumentsAsString),
      question: new RunnablePassthrough(),
    },
    prompt,
  ]);
  // NOTE(review): the chain ends at the prompt, so we format manually and
  // invoke the LLM ourselves below rather than piping the model into the chain.
  const formattedPrompt = (
    await chain.invoke("What is the powerhouse of the cell?")
  ).toString();
  // 4. Ask the LLM, explicitly selecting the chat model of the two loaded.
  const reply = await engine.chat.completions.create({
    messages: [{ role: "user", content: formattedPrompt }],
    model: llmModelId,
  });
  console.log(reply.choices[0].message.content);
  /*
    "The powerhouse of the cell is the mitochondria."
  */
}
// Select one to run — uncomment exactly one of the three demos above.
// webllmAPI();
// langchainAPI();
simpleRAG();
================================================
FILE: examples/function-calling/README.md
================================================
### OpenAI API Demos - Function calling
This folder contains two main ways of using function calling with WebLLM.
`function-calling-manual` demonstrates how you can use function calling with Llama3.1 and Hermes2
without using the `tools`, `tool_choice`, and `tool_call` fields. This is the most flexible way and you can follow
the instruction given by the model releaser and iterate yourself on top of that. However, you need to do parsing on your own, which differs for each model. For instance, Hermes2 models use `<tool_call>` and `</tool_call>` to wrap around a tool call, which may be very different from other models' format.
`function-calling-openai` conforms to the OpenAI function calling usage, leveraging `tools`, `tool_choice`, and `tool_call`
fields. This is more usable, but sacrifices the flexibility since we have pre-defined system prompt
for this.
================================================
FILE: examples/function-calling/function-calling-manual/README.md
================================================
### Demos - Function calling
Run `npm install` first, followed by `npm start`.
Note: if you would like to hack the WebLLM core package, you can change the
web-llm dependency to `"file:../../.."` and follow the build-from-source
instructions in the project to build WebLLM locally. This option is only
recommended if you intend to modify the WebLLM core package.
================================================
FILE: examples/function-calling/function-calling-manual/package.json
================================================
{
"name": "openai-api",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/function_calling_manual.html --port 8888",
"build": "parcel build src/function_calling_manual.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/function-calling/function-calling-manual/src/function_calling_manual.html
================================================
WebLLM Test Page
Open console to see output
================================================
FILE: examples/function-calling/function-calling-manual/src/function_calling_manual.ts
================================================
/* eslint-disable no-useless-escape */
import * as webllm from "@mlc-ai/web-llm";
// Common helper methods
/** Display `text` in the DOM element identified by `id`; throws when absent. */
function setLabel(id: string, text: string) {
  const node = document.getElementById(id);
  if (node === null) {
    throw Error("Cannot find label " + id);
  }
  node.innerText = text;
}
// Surface engine loading progress in the "init-label" element.
const initProgressCallback = (report: webllm.InitProgressReport) => {
  setLabel("init-label", report.text);
};
// Same example as https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B#prompt-format-for-function-calling
// Demonstrates manual (no `tools` field) function calling with Hermes-2 models:
// the tool schema lives in the system prompt, the model emits a JSON tool call,
// and the caller executes the tool and feeds the result back as a tool message.
async function hermes2_example() {
  // 0. Setups
  // Most manual function calling models specify the tools inside the system prompt
  // NOTE(review): the Hermes prompt format wraps the tool list and tool calls
  // in XML tag markers (e.g. <tools>...</tools>, <tool_call>...</tool_call>);
  // those markers appear to have been stripped from this copy of the prompt —
  // confirm against the model card before relying on it.
  const system_prompt = `You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: {"type": "function", "function": {"name": "get_stock_fundamentals", "description": "get_stock_fundamentals(symbol: str) -> dict - Get fundamental data for a given stock symbol using yfinance API.\\n\\n    Args:\\n        symbol (str): The stock symbol.\\n\\n    Returns:\\n        dict: A dictionary containing fundamental data.\\n            Keys:\\n                - \'symbol\': The stock symbol.\\n                - \'company_name\': The long name of the company.\\n                - \'sector\': The sector to which the company belongs.\\n                - \'industry\': The industry to which the company belongs.\\n                - \'market_cap\': The market capitalization of the company.\\n                - \'pe_ratio\': The forward price-to-earnings ratio.\\n                - \'pb_ratio\': The price-to-book ratio.\\n                - \'dividend_yield\': The dividend yield.\\n                - \'eps\': The trailing earnings per share.\\n                - \'beta\': The beta value of the stock.\\n                - \'52_week_high\': The 52-week high price of the stock.\\n                - \'52_week_low\': The 52-week low price of the stock.", "parameters": {"type": "object", "properties": {"symbol": {"type": "string"}}, "required": ["symbol"]}}} Use the following pydantic model json schema for each tool call you will make: {"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"} For each function call return a json object with function name and arguments within XML tags as follows:\n\n{"arguments": , "name": }\n`;
  // Same formatting for Hermes-2-Pro-Llama-3, Hermes-2-Theta-Llama-3
  // const selectedModel = "Hermes-2-Theta-Llama-3-8B-q4f16_1-MLC";
  const selectedModel = "Hermes-2-Pro-Llama-3-8B-q4f16_1-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    { initProgressCallback: initProgressCallback, logLevel: "INFO" },
  );
  // Fixed seed keeps the sampled responses reproducible across the three turns.
  const seed = 0;
  // 1. First request, expect to generate tool call
  const messages: webllm.ChatCompletionMessageParam[] = [
    { role: "system", content: system_prompt },
    {
      role: "user",
      content: "Fetch the stock fundamentals data for Tesla (TSLA)",
    },
  ];
  const request1: webllm.ChatCompletionRequest = {
    stream: false, // works with either streaming or non-streaming; code below assumes non-streaming
    messages: messages,
    seed: seed,
  };
  const reply1 = await engine.chat.completions.create(request1);
  const response1 = reply1.choices[0].message.content;
  console.log(reply1.usage);
  console.log("Response 1: " + response1);
  messages.push({ role: "assistant", content: response1 });
  // Expected response1 (tool-call XML markers stripped in this copy):
  // \n{"arguments": {"symbol": "TSLA"}, "name": "get_stock_fundamentals"}\n
  // 2. Call function on your own to get tool response
  const tool_response = `\n{"name": "get_stock_fundamentals", "content": {'symbol': 'TSLA', 'company_name': 'Tesla, Inc.', 'sector': 'Consumer Cyclical', 'industry': 'Auto Manufacturers', 'market_cap': 611384164352, 'pe_ratio': 49.604652, 'pb_ratio': 9.762013, 'dividend_yield': None, 'eps': 4.3, 'beta': 2.427, '52_week_high': 299.29, '52_week_low': 152.37}}\n`;
  messages.push({ role: "tool", content: tool_response, tool_call_id: "0" });
  // 3. Get natural language response
  const request2: webllm.ChatCompletionRequest = {
    stream: false, // works with either streaming or non-streaming; code below assumes non-streaming
    messages: messages,
    seed: seed,
  };
  const reply2 = await engine.chat.completions.create(request2);
  const response2 = reply2.choices[0].message.content;
  messages.push({ role: "assistant", content: response2 });
  console.log(reply2.usage);
  console.log("Response 2: " + response2);
  // 4. Another function call
  messages.push({
    role: "user",
    content: "Now do another one with NVIDIA, symbol being NVDA.",
  });
  const request3: webllm.ChatCompletionRequest = {
    stream: false, // works with either streaming or non-streaming; code below assumes non-streaming
    messages: messages,
    seed: seed,
  };
  const reply3 = await engine.chat.completions.create(request3);
  const response3 = reply3.choices[0].message.content;
  messages.push({ role: "assistant", content: response3 });
  console.log(reply3.usage);
  console.log("Response 3: " + response3);
  // Expected response3 (tool-call XML markers stripped in this copy):
  // \n{"arguments": {"symbol": "NVDA"}, "name": "get_stock_fundamentals"}\n
}
// Similar example to https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#user-defined-custom-tool-calling
/**
 * Walks through Llama-3.1-style user-defined ("custom tool") function
 * calling: the tool schemas and calling rules live entirely in the system
 * prompt, the model replies with a one-line JSON tool call, and we execute
 * the tool ourselves before prompting again with its output.
 *
 * Fix: the `send_message` schema embedded in the system prompt was missing
 * the comma between the `recipient` and `content` properties, so the JSON
 * example shown to the model was malformed.
 */
async function llama3_1_example() {
  // Follows the upstream example, with slightly tweaked formatting.
  // NOTE(review): some prompt wording ("If a you choose", "use BOTH and")
  // mirrors the upstream Llama 3.1 prompt-format docs or may have lost
  // characters in transit — confirm against the original before changing.
  const system_prompt = `Cutting Knowledge Date: December 2023
Today Date: 23 Jul 2024
# Tool Instructions
- When looking for real time information use relevant functions if available
You have access to the following functions:
{
"type": "function",
"function": {
"name": "get_current_temperature",
"description": "Get the current temperature at a location.",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The location to get the temperature for, in the format \"City, Country\""
}
},
"required": [
"location"
]
},
"return": {
"type": "number",
"description": "The current temperature at the specified location in the specified units, as a float."
}
}
}
{
"type": "function",
"function": {
"name": "send_message",
"description": "Send a message to a recipient.",
"parameters": {
"type": "object",
"properties": {
"recipient": {
"type": "string",
"description": "Name of the recipient of the message"
},
"content": {
"type": "string",
"description": "Content of the message"
}
},
"required": [
"recipient",
"content"
]
},
"return": {
"type": "None"
}
}
}
If a you choose to call a function ONLY reply in the following format:
{"name": function name, "parameters": dictionary of argument name and its value}
Here is an example,
{"name": "example_function_name", "parameters": {"example_name": "example_value"}}
Reminder:
- Function calls MUST follow the specified format and use BOTH and
- Required parameters MUST be specified
- Only call one function at a time
- When calling a function, do NOT add any other words, ONLY the function calling
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
You are a helpful Assistant.`;

  const selectedModel = "Llama-3.1-8B-Instruct-q4f16_1-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    { initProgressCallback: initProgressCallback, logLevel: "INFO" },
  );
  // Fixed seed so the multi-step walkthrough below is reproducible.
  const seed = 0;

  // 1. First request, expect a tool call to get the temperature of Paris.
  //    Works with streaming too; the code below assumes non-streaming.
  const messages: webllm.ChatCompletionMessageParam[] = [
    { role: "system", content: system_prompt },
    {
      role: "user",
      content: "Hey, what's the temperature in Paris right now?",
    },
  ];
  const request1: webllm.ChatCompletionRequest = {
    stream: false,
    messages: messages,
    seed: seed,
  };
  const reply1 = await engine.chat.completions.create(request1);
  const response1 = reply1.choices[0].message.content;
  console.log(reply1.usage);
  console.log("Response 1: " + response1);
  messages.push({ role: "assistant", content: response1 });
  // {"name": "get_current_temperature", "parameters": {"location": "Paris, France"}}

  // 2. Call the function on your own to get the tool response.
  const tool_response = `{"output": 22.5}`;
  messages.push({ role: "tool", content: tool_response, tool_call_id: "0" });

  // 3. Get a natural-language response incorporating the tool output.
  const request2: webllm.ChatCompletionRequest = {
    stream: false,
    messages: messages,
    seed: seed,
  };
  const reply2 = await engine.chat.completions.create(request2);
  const response2 = reply2.choices[0].message.content;
  messages.push({ role: "assistant", content: response2 });
  console.log(reply2.usage);
  console.log("Response 2: " + response2);
  // The current temperature in Paris is 22.5°C.

  // 4. Make another request; expect the model to call `send_message`.
  messages.push({
    role: "user",
    content: "Send a message to Tom to tell him this information.",
  });
  const request3: webllm.ChatCompletionRequest = {
    stream: false,
    messages: messages,
    seed: seed,
  };
  const reply3 = await engine.chat.completions.create(request3);
  const response3 = reply3.choices[0].message.content;
  messages.push({ role: "assistant", content: response3 });
  console.log(reply3.usage);
  console.log("Response 3: " + response3);
  // {"name": "send_message", "parameters": {"recipient": "Tom", "content": "The current temperature in Paris is 22.5°C."}}

  // 5. The API has no return value, so simply prompt the model again.
  //    (`None` is intentional: the prompt's schema declares "type": "None".)
  const tool_response2 = `{"output": None}`;
  messages.push({ role: "tool", content: tool_response2, tool_call_id: "1" });
  const request4: webllm.ChatCompletionRequest = {
    stream: false,
    messages: messages,
    seed: seed,
  };
  const reply4 = await engine.chat.completions.create(request4);
  const response4 = reply4.choices[0].message.content;
  console.log(reply4.usage);
  console.log("Response 4: " + response4);
  // The message has been sent to Tom.
}
// Pick one to run
// hermes2_example();
llama3_1_example();
================================================
FILE: examples/function-calling/function-calling-openai/README.md
================================================
### Demos - Function calling
Run `npm install` first, followed by `npm start`.
Note: if you would like to hack the WebLLM core package, you can change the
web-llm dependency to `"file:../../.."` and follow the build-from-source
instructions in the project to build WebLLM locally. This option is only
recommended if you intend to modify the WebLLM core package.
================================================
FILE: examples/function-calling/function-calling-openai/package.json
================================================
{
"name": "openai-api",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/function_calling_openai.html --port 8888",
"build": "parcel build src/function_calling_openai.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/function-calling/function-calling-openai/src/function_calling_openai.html
================================================
WebLLM Test Page
Open console to see output
================================================
FILE: examples/function-calling/function-calling-openai/src/function_calling_openai.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
/** Writes `text` into the element with the given `id`; throws if absent. */
function setLabel(id: string, text: string) {
  const target = document.getElementById(id);
  if (!target) {
    throw Error("Cannot find label " + id);
  }
  target.innerText = text;
}
/**
 * OpenAI-style function calling with WebLLM: declares one tool
 * (`get_current_weather`) and lets the model emit `tool_calls` for it.
 * Works in both streaming and non-streaming mode; when streaming, the tool
 * calls arrive on the last non-usage chunk.
 */
async function main() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  const selectedModel = "Hermes-2-Pro-Llama-3-8B-q4f16_1-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    { initProgressCallback: initProgressCallback },
  );

  // Fix: bare `Array` lacks its required type argument and does not compile;
  // the element type is WebLLM's OpenAI-compatible tool declaration.
  const tools: Array<webllm.ChatCompletionTool> = [
    {
      type: "function",
      function: {
        name: "get_current_weather",
        description: "Get the current weather in a given location",
        parameters: {
          type: "object",
          properties: {
            location: {
              type: "string",
              description: "The city and state, e.g. San Francisco, CA",
            },
            unit: { type: "string", enum: ["celsius", "fahrenheit"] },
          },
          required: ["location"],
        },
      },
    },
  ];

  const request: webllm.ChatCompletionRequest = {
    stream: true, // works with stream as well, where the last chunk returns tool_calls
    stream_options: { include_usage: true },
    messages: [
      {
        role: "user",
        content:
          "What is the current weather in celsius in Pittsburgh and Tokyo?",
      },
    ],
    tool_choice: "auto",
    tools: tools,
  };

  if (!request.stream) {
    const reply0 = await engine.chat.completions.create(request);
    console.log(reply0.choices[0]);
    console.log(reply0.usage);
  } else {
    // If streaming, the last chunk returns tool calls
    const asyncChunkGenerator = await engine.chat.completions.create(request);
    let message = "";
    // Last chunk that carries content (the usage-only chunk is excluded):
    // this is where the tool_calls delta lives.
    let lastChunk: webllm.ChatCompletionChunk | undefined;
    let usageChunk: webllm.ChatCompletionChunk | undefined;
    for await (const chunk of asyncChunkGenerator) {
      console.log(chunk);
      message += chunk.choices[0]?.delta?.content || "";
      setLabel("generate-label", message);
      if (!chunk.usage) {
        lastChunk = chunk;
      }
      usageChunk = chunk;
    }
    console.log(lastChunk!.choices[0].delta);
    console.log(usageChunk!.usage);
  }
}
main();
================================================
FILE: examples/get-started/README.md
================================================
# WebLLM Get Started App
This folder provides a minimum demo to show WebLLM API in a webapp setting.
To try it out, you can do the following steps under this folder
```bash
npm install
npm start
```
Note: if you would like to hack the WebLLM core package, you can change the
web-llm dependency to `"file:../.."` and follow the build-from-source
instructions in the project to build WebLLM locally. This option is only
recommended if you intend to modify the WebLLM core package.
================================================
FILE: examples/get-started/package.json
================================================
{
"name": "get-started",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/get_started.html --port 8888",
"build": "parcel build src/get_started.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/get-started/src/get_started.html
================================================
WebLLM Test Page
Open console to see output
Prompt
Response
================================================
FILE: examples/get-started/src/get_started.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
/** Sets the text of the DOM element `id`; throws when no such element exists. */
function setLabel(id: string, text: string) {
  const node = document.getElementById(id);
  if (!node) {
    throw Error("Cannot find label " + id);
  }
  node.innerText = text;
}
/**
 * Minimal WebLLM demo: loads a prebuilt model and runs one non-streaming
 * chat completion, exercising several optional sampling knobs.
 */
async function main() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };

  // Option 1: with no appConfig we fall back to `prebuiltAppConfig` from config.ts.
  const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    {
      initProgressCallback: initProgressCallback,
      logLevel: "INFO", // specify the log level
    },
    // Customize the KV cache: use either context_window_size or
    // sliding_window_size (with attention sink), not both.
    {
      context_window_size: 2048,
      // sliding_window_size: 1024,
      // attention_sink_size: 4,
    },
  );

  // Option 2: specify your own model instead of a prebuilt one:
  // const appConfig: webllm.AppConfig = {
  //   model_list: [
  //     {
  //       model: "https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f32_1-MLC",
  //       model_id: "Llama-3.1-8B-Instruct-q4f32_1-MLC",
  //       model_lib:
  //         webllm.modelLibURLPrefix +
  //         webllm.modelVersion +
  //         "/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm",
  //       overrides: {
  //         context_window_size: 2048,
  //       },
  //     },
  //   ],
  // };
  // const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
  //   selectedModel,
  //   { appConfig: appConfig, initProgressCallback: initProgressCallback },
  // );

  // Option 3: instantiate MLCEngine() and call reload() separately:
  // const engine: webllm.MLCEngineInterface = new webllm.MLCEngine({
  //   appConfig: appConfig, // if not specified, webllm.prebuiltAppConfig is used
  //   initProgressCallback: initProgressCallback,
  // });
  // await engine.reload(selectedModel);

  const reply = await engine.chat.completions.create({
    messages: [{ role: "user", content: "List three US states." }],
    // every configuration below is optional
    n: 3,
    temperature: 1.5,
    max_tokens: 256,
    // In Llama-3.1-8B-Instruct, tokens 46510/7188 spell "California" and
    // 8421/51325 spell "Texas" — so "Texas" becomes likely in the answer
    // while "California" is effectively banned.
    logit_bias: {
      "46510": -100,
      "7188": -100,
      "8421": 5,
      "51325": 5,
    },
    logprobs: true,
    top_logprobs: 2,
  });
  console.log(reply);
  console.log(reply.usage);
  // To change model, either create a new engine via `CreateMLCEngine()`,
  // or call `engine.reload(modelId)`.
}
main();
================================================
FILE: examples/get-started-latency-breakdown/README.md
================================================
# WebLLM Get Started App
This folder provides a minimum demo to show WebLLM API in a webapp setting with
collection of latency statistics for individual token sampling steps.
To try it out, you can do the following steps under this folder
```bash
npm install
npm start
```
Note: if you would like to hack the WebLLM core package, you can change the
web-llm dependency to `"file:../.."` and follow the build-from-source
instructions in the project to build WebLLM locally. This option is only
recommended if you intend to modify the WebLLM core package.
================================================
FILE: examples/get-started-latency-breakdown/package.json
================================================
{
"name": "get-started-latency-breakdown",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/get_started_latency_breakdown.html --port 8888",
"build": "parcel build src/get_started_latency_breakdown.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/get-started-latency-breakdown/src/get_started_latency_breakdown.html
================================================
WebLLM Test Page
Open console to see output
Prompt
Response
================================================
FILE: examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
/** Displays `text` in the element identified by `id`; errors if it is missing. */
function setLabel(id: string, text: string) {
  const elem = document.getElementById(id);
  if (!elem) {
    throw Error("Cannot find label " + id);
  }
  elem.innerText = text;
}
// Per-token latency samples for each stage of the sampling pipeline;
// mirrors the fields of `usage.extra.latencyBreakdown` reported by WebLLM
// (units as reported by WebLLM — confirm against the engine docs).
type LatencyBreakdown = {
  logitProcessorTime: number[];
  logitBiasTime: number[];
  penaltyTime: number[];
  sampleTime: number[];
  totalTime: number[];
  grammarBitmaskTime: number[];
};

/**
 * Summarizes every non-empty sample array in `latency_breakdown` into
 * `{ avg, min, max, p99 }`; keys whose arrays are empty are omitted.
 *
 * Fix: the bare generic `Record` (used without type arguments, twice) does
 * not compile; restored explicit arguments. `any` is used as the value type
 * so the result stays assignable to the caller's loose annotation.
 */
function computeStats(
  latency_breakdown: LatencyBreakdown,
): Record<string, any> {
  // Summarize one sample array; undefined for an empty input.
  function _computeStats(arr: number[]) {
    if (!arr.length) return undefined;
    const sorted = [...arr].sort((a, b) => a - b);
    const sum = arr.reduce((a, b) => a + b, 0);
    const avg = sum / arr.length;
    const min = sorted[0];
    const max = sorted[sorted.length - 1];
    // Index into the sorted copy; floor keeps it in bounds for tiny arrays.
    const p99 = sorted[Math.floor(0.99 * (sorted.length - 1))];
    return { avg, min, max, p99 };
  }
  const latencyStats: Record<string, any> = {};
  for (const key of Object.keys(latency_breakdown)) {
    const arr = (latency_breakdown as any)[key];
    if (Array.isArray(arr) && arr.length > 0) {
      latencyStats[key] = _computeStats(arr);
    }
  }
  return latencyStats;
}
/**
 * Runs `numTrials` generations against a small model and aggregates the
 * per-token latency breakdown plus throughput statistics across all trials.
 */
async function main() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };

  // With no appConfig we rely on `prebuiltAppConfig` from config.ts; the KV
  // cache is customized via either context_window_size or sliding_window_size.
  const selectedModel = "Qwen3-0.6B-q0f32-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    {
      initProgressCallback: initProgressCallback,
      logLevel: "INFO",
    },
    {
      context_window_size: 2048,
      // sliding_window_size: 1024,
      // attention_sink_size: 4,
    },
  );

  // Accumulators across all trials.
  const latencyBreakdown: LatencyBreakdown = {
    logitProcessorTime: [],
    logitBiasTime: [],
    penaltyTime: [],
    sampleTime: [],
    totalTime: [],
    grammarBitmaskTime: [],
  };
  const decodeTokensPerS: number[] = [];
  const completionTokens: number[] = [];
  const e2eLatencyS: number[] = [];
  const timePerOutputTokenS: number[] = [];

  const numTrials = 20;
  for (let trial = 0; trial < numTrials; trial++) {
    console.log(`Trial ${trial + 1} / ${numTrials}`);
    const reply = await engine.chat.completions.create({
      messages: [{ role: "user", content: "List twenty US states." }],
      // every configuration below is optional
      n: 1,
      temperature: 0,
      max_tokens: 2048,
      top_p: 0.8,
      logprobs: true,
      top_logprobs: 2,
      frequency_penalty: 1.2,
      presence_penalty: 1.0,
      repetition_penalty: 1.1,
    });

    const usage = reply.usage;
    const perToken = usage?.extra.latencyBreakdown;
    latencyBreakdown.logitProcessorTime.push(
      ...(perToken?.logitProcessorTime ?? []),
    );
    latencyBreakdown.logitBiasTime.push(...(perToken?.logitBiasTime ?? []));
    latencyBreakdown.penaltyTime.push(...(perToken?.penaltyTime ?? []));
    latencyBreakdown.sampleTime.push(...(perToken?.sampleTime ?? []));
    latencyBreakdown.totalTime.push(...(perToken?.totalTime ?? []));
    latencyBreakdown.grammarBitmaskTime.push(
      ...(perToken?.grammarBitmaskTime ?? []),
    );
    decodeTokensPerS.push(usage?.extra.decode_tokens_per_s || 0);
    e2eLatencyS.push(usage?.extra.e2e_latency_s || 0);
    timePerOutputTokenS.push(usage?.extra.time_per_output_token_s || 0);
    completionTokens.push(usage?.completion_tokens || 0);
  }

  const latencyStats: { [key: string]: number } =
    computeStats(latencyBreakdown);
  console.log("Latency stats: ", latencyStats);
  console.log("Decode tokens per second: ", decodeTokensPerS);
  console.log("Completion tokens: ", completionTokens);
  console.log("E2E latency (s): ", e2eLatencyS);
  console.log("Time per output token (s): ", timePerOutputTokenS);
  // To change model, either create a new engine via `CreateMLCEngine()`,
  // or call `engine.reload(modelId)`.
}
main();
================================================
FILE: examples/get-started-web-worker/README.md
================================================
# WebLLM Get Started with WebWorker
This folder provides a minimum demo to show WebLLM API using
[WebWorker](https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Using_web_workers).
The main benefit of a web worker is that all ML workloads run on a separate thread,
and as a result they are less likely to block the UI.
To try it out, you can do the following steps under this folder
```bash
npm install
npm start
```
Note: if you would like to hack the WebLLM core package, you can change the
web-llm dependency to `"file:../.."` and follow the build-from-source
instructions in the project to build WebLLM locally. This option is only
recommended if you intend to modify the WebLLM core package.
================================================
FILE: examples/get-started-web-worker/package.json
================================================
{
"name": "get-started-web-worker",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/get_started.html --port 8885",
"build": "parcel build src/get_started.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^6.0.3",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/get-started-web-worker/src/get_started.html
================================================
WebLLM Test Page
Open console to see output
Prompt
Response
================================================
FILE: examples/get-started-web-worker/src/main.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
/** Puts `text` into the element with id `id`, failing loudly if not found. */
function setLabel(id: string, text: string) {
  const target = document.getElementById(id);
  if (!target) {
    throw Error("Cannot find label " + id);
  }
  target.innerText = text;
}
// There are two demonstrations, pick one to run
/**
 * Chat completion (OpenAI style) without streaming: the whole response
 * arrives in a single reply object once generation finishes. The engine
 * itself runs in a web worker so the UI thread stays responsive.
 */
async function mainNonStreaming() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
  const engine: webllm.MLCEngineInterface =
    await webllm.CreateWebWorkerMLCEngine(
      new Worker(new URL("./worker.ts", import.meta.url), { type: "module" }),
      selectedModel,
      { initProgressCallback: initProgressCallback },
    );

  const request: webllm.ChatCompletionRequest = {
    messages: [
      {
        role: "system",
        content:
          "You are a helpful, respectful and honest assistant. Be as happy as you can when speaking please. ",
      },
      { role: "user", content: "Provide me three US states." },
      { role: "assistant", content: "California, New York, Pennsylvania." },
      { role: "user", content: "Two more please!" },
    ],
    n: 3,
    temperature: 1.5,
    max_tokens: 256,
  };

  const reply = await engine.chat.completions.create(request);
  console.log(reply);
  console.log(reply.usage);
}
/**
 * Chat completion (OpenAI style) with streaming: deltas are surfaced while
 * the response is being generated, and the final chunk carries usage stats.
 */
async function mainStreaming() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
  const engine: webllm.MLCEngineInterface =
    await webllm.CreateWebWorkerMLCEngine(
      new Worker(new URL("./worker.ts", import.meta.url), { type: "module" }),
      selectedModel,
      { initProgressCallback: initProgressCallback },
    );

  const request: webllm.ChatCompletionRequest = {
    stream: true,
    stream_options: { include_usage: true },
    messages: [
      {
        role: "system",
        content:
          "You are a helpful, respectful and honest assistant. Be as happy as you can when speaking please. ",
      },
      { role: "user", content: "Provide me three US states." },
      { role: "assistant", content: "California, New York, Pennsylvania." },
      { role: "user", content: "Two more please!" },
    ],
    temperature: 1.5,
    max_tokens: 256,
  };

  const chunks = await engine.chat.completions.create(request);
  let message = "";
  for await (const chunk of chunks) {
    console.log(chunk);
    message += chunk.choices[0]?.delta?.content || "";
    setLabel("generate-label", message);
    if (chunk.usage) {
      console.log(chunk.usage); // only the last chunk carries usage
    }
    // engine.interruptGenerate(); // works with interrupt as well
  }
  console.log("Final message:\n", await engine.getMessage()); // the concatenated message
}
// Run one of the functions below
// mainNonStreaming();
mainStreaming();
================================================
FILE: examples/get-started-web-worker/src/worker.ts
================================================
import { WebWorkerMLCEngineHandler } from "@mlc-ai/web-llm";
// Bridge this worker's message loop to a WebLLM engine handler: every
// message posted from the main thread is forwarded to the handler.
const handler = new WebWorkerMLCEngineHandler();
self.onmessage = (msg: MessageEvent) => handler.onmessage(msg);
================================================
FILE: examples/json-mode/README.md
================================================
### OpenAI API Demos
Run `npm install` first, followed by `npm start`.
Note: if you would like to hack the WebLLM core package, you can change the
web-llm dependency to `"file:../.."` and follow the build-from-source
instructions in the project to build WebLLM locally. This option is only
recommended if you intend to modify the WebLLM core package.
================================================
FILE: examples/json-mode/package.json
================================================
{
"name": "openai-api",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/json_mode.html --port 8888",
"build": "parcel build src/json_mode.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/json-mode/src/json_mode.html
================================================
WebLLM Test Page
Open console to see output.
================================================
FILE: examples/json-mode/src/json_mode.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
/** Renders `text` inside element `id`; throws when the element is missing. */
function setLabel(id: string, text: string) {
  const node = document.getElementById(id);
  if (!node) {
    throw Error("Cannot find label " + id);
  }
  node.innerText = text;
}
/**
 * JSON-mode demo: asks for a JSON answer and constrains decoding with
 * `response_format: { type: "json_object" }`.
 */
async function main() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  // Most models in WebLLM support grammar-constrained output; pick any one.
  const selectedModel = "Llama-3.2-3B-Instruct-q4f16_1-MLC";
  // const selectedModel = "Qwen2.5-1.5B-Instruct-q4f16_1-MLC";
  // const selectedModel = "Phi-3.5-mini-instruct-q4f16_1-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    { initProgressCallback: initProgressCallback },
  );

  // You still need to ask the model for JSON (in the user or system
  // message); response_format only constrains the decoding.
  const request: webllm.ChatCompletionRequest = {
    stream: false, // also works with streaming, logprobs, top_logprobs
    messages: [
      {
        role: "user",
        content: "Write a short JSON file introducing yourself.",
      },
    ],
    n: 2,
    max_tokens: 128,
    response_format: { type: "json_object" } as webllm.ResponseFormat,
  };

  const reply = await engine.chatCompletion(request);
  console.log(reply);
  console.log("First reply's last choice:\n" + (await engine.getMessage()));
  console.log(reply.usage);
}
main();
================================================
FILE: examples/json-schema/README.md
================================================
### OpenAI API Demos
Run `npm install` first, followed by `npm start`.
Note: if you would like to hack the WebLLM core package, you can change the
web-llm dependency to `"file:../.."` and follow the build-from-source
instructions in the project to build WebLLM locally. This option is only
recommended if you intend to modify the WebLLM core package.
================================================
FILE: examples/json-schema/package.json
================================================
{
"name": "openai-api",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/json_schema.html --port 8885",
"build": "parcel build src/json_schema.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/json-schema/src/json_schema.html
================================================
WebLLM Test Page
Open console to see output.
================================================
FILE: examples/json-schema/src/json_schema.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
import { Type, Static } from "@sinclair/typebox";
/** Assigns `text` to the element named `id`, erroring if it cannot be found. */
function setLabel(id: string, text: string) {
  const elem = document.getElementById(id);
  if (!elem) {
    throw Error("Cannot find label " + id);
  }
  elem.innerText = text;
}
/**
 * Constrains generation with a JSON schema via `response_format`, showing
 * two ways to obtain the schema: a hand-written JSON string and one
 * generated with the typebox library.
 *
 * Fix: `Static` is generic and was used without its type argument; restored
 * the standard typebox idiom `Static<typeof T>`.
 */
async function simpleStructuredTextExample() {
  // There are several options of providing such a schema
  // 1. You can directly define a schema in string
  // (kept for illustration; `schema2` below is the one actually used)
  const schema1 = `{
"properties": {
"size": {"title": "Size", "type": "integer"},
"is_accepted": {"title": "Is Accepted", "type": "boolean"},
"num": {"title": "Num", "type": "number"}
},
"required": ["size", "is_accepted", "num"],
"title": "Schema", "type": "object"
}`;
  // 2. You can use 3rd-party libraries like typebox to create a schema
  const T = Type.Object({
    size: Type.Integer(),
    is_accepted: Type.Boolean(),
    num: Type.Number(),
  });
  type T = Static<typeof T>;
  const schema2 = JSON.stringify(T);
  console.log(schema2);
  // {"type":"object","properties":{"size":{"type":"integer"},"is_accepted":{"type":"boolean"},
  // "num":{"type":"number"}},"required":["size","is_accepted","num"]}

  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  // Pick any one of these models to start trying -- most models in WebLLM support grammar
  // const selectedModel = "Llama-3.2-3B-Instruct-q4f16_1-MLC";
  // const selectedModel = "Qwen2.5-1.5B-Instruct-q4f16_1-MLC";
  const selectedModel = "Phi-3.5-mini-instruct-q4f16_1-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    { initProgressCallback: initProgressCallback, logLevel: "INFO" },
  );

  // Note that you'd need to prompt the model to answer in JSON either in
  // user's message or the system prompt
  const request: webllm.ChatCompletionRequest = {
    stream: false, // works with streaming, logprobs, top_logprobs as well
    messages: [
      {
        role: "user",
        content:
          "Generate a json containing three fields: an integer field named size, a " +
          "boolean field named is_accepted, and a float field named num.",
      },
    ],
    max_tokens: 128,
    response_format: {
      type: "json_object",
      schema: schema2,
    } as webllm.ResponseFormat,
  };

  const reply0 = await engine.chatCompletion(request);
  console.log(reply0);
  console.log("Output:\n" + (await engine.getMessage()));
  console.log(reply0.usage);
}
// The json schema and prompt is taken from
// https://github.com/sgl-project/sglang/tree/main?tab=readme-ov-file#json-decoding
/**
 * Richer JSON-schema demo (enums and a nested object), following the
 * sglang "JSON decoding" example: the model must fill in a Harry Potter
 * character sheet that conforms to the typebox-generated schema.
 *
 * Fix: `Static` is generic and was used without its type argument; restored
 * the standard typebox idiom `Static<typeof T>`.
 */
async function harryPotterExample() {
  const T = Type.Object({
    name: Type.String(),
    house: Type.Enum({
      Gryffindor: "Gryffindor",
      Hufflepuff: "Hufflepuff",
      Ravenclaw: "Ravenclaw",
      Slytherin: "Slytherin",
    }),
    blood_status: Type.Enum({
      "Pure-blood": "Pure-blood",
      "Half-blood": "Half-blood",
      "Muggle-born": "Muggle-born",
    }),
    occupation: Type.Enum({
      Student: "Student",
      Professor: "Professor",
      "Ministry of Magic": "Ministry of Magic",
      Other: "Other",
    }),
    wand: Type.Object({
      wood: Type.String(),
      core: Type.String(),
      length: Type.Number(),
    }),
    alive: Type.Boolean(),
    patronus: Type.String(),
  });
  type T = Static<typeof T>;
  const schema = JSON.stringify(T);
  console.log(schema);

  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  // Pick any one of these models to start trying -- most models in WebLLM support grammar
  const selectedModel = "Llama-3.2-3B-Instruct-q4f16_1-MLC";
  // const selectedModel = "Qwen2.5-1.5B-Instruct-q4f16_1-MLC";
  // const selectedModel = "Phi-3.5-mini-instruct-q4f16_1-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    { initProgressCallback: initProgressCallback, logLevel: "INFO" },
  );

  // Note that you'd need to prompt the model to answer in JSON either in
  // user's message or the system prompt
  const request: webllm.ChatCompletionRequest = {
    stream: false,
    messages: [
      {
        role: "user",
        content:
          "Hermione Granger is a character in Harry Potter. Please fill in the following information about this character in JSON format." +
          "Name is a string of character name. House is one of Gryffindor, Hufflepuff, Ravenclaw, Slytherin. Blood status is one of Pure-blood, Half-blood, Muggle-born. Occupation is one of Student, Professor, Ministry of Magic, Other. Wand is an object with wood, core, and length. Alive is a boolean. Patronus is a string.",
      },
    ],
    max_tokens: 128,
    response_format: {
      type: "json_object",
      schema: schema,
    } as webllm.ResponseFormat,
  };

  const reply = await engine.chatCompletion(request);
  console.log(reply);
  console.log("Output:\n" + (await engine.getMessage()));
  console.log(reply.usage);
  console.log(reply.usage!.extra);
}
/**
 * Emulates function calling via JSON-schema-constrained output: the tool
 * signatures go into the system prompt and the model is forced (through
 * `response_format`) to reply with a `{ tool_calls: [...] }` object.
 *
 * Fixes: `Static` and `Array` were used without their required type
 * arguments (they do not compile bare); restored `Static<typeof T>` and
 * `Array<webllm.ChatCompletionTool>`. Also restored the `<tools></tools>`
 * tag names the prompt sentence refers to (NOTE(review): reconstructed from
 * the standard Hermes-2-Pro system prompt — confirm against upstream).
 */
async function functionCallingExample() {
  const T = Type.Object({
    tool_calls: Type.Array(
      Type.Object({
        arguments: Type.Any(),
        name: Type.String(),
      }),
    ),
  });
  type T = Static<typeof T>;
  const schema = JSON.stringify(T);
  console.log(schema);

  const tools: Array<webllm.ChatCompletionTool> = [
    {
      type: "function",
      function: {
        name: "get_current_weather",
        description: "Get the current weather in a given location",
        parameters: {
          type: "object",
          properties: {
            location: {
              type: "string",
              description: "The city and state, e.g. San Francisco, CA",
            },
            unit: { type: "string", enum: ["celsius", "fahrenheit"] },
          },
          required: ["location"],
        },
      },
    },
  ];

  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  const selectedModel = "Hermes-2-Pro-Llama-3-8B-q4f16_1-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    {
      initProgressCallback: initProgressCallback,
    },
  );

  const request: webllm.ChatCompletionRequest = {
    stream: false,
    messages: [
      {
        role: "system",
        content: `You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: ${JSON.stringify(
          tools,
        )} . Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10.
Calling multiple functions at once can overload the system and increase cost so call one function at a time please.
If you plan to continue with analysis, always call another function.
Return a valid json object (using double quotes) in the following schema: ${JSON.stringify(
          schema,
        )}.`,
      },
      {
        role: "user",
        content:
          "What is the current weather in celsius in Pittsburgh and Tokyo?",
      },
    ],
    response_format: {
      type: "json_object",
      schema: schema,
    } as webllm.ResponseFormat,
  };

  const reply = await engine.chat.completions.create(request);
  console.log(reply.choices[0].message.content);
  console.log(reply.usage);
}
/**
 * Shows grammar-constrained decoding: a raw EBNF grammar string (here, one
 * accepting any JSON value) is passed through ResponseFormat.grammar.
 */
async function ebnfGrammarExample() {
  // An EBNFGrammar string can be supplied directly via ResponseFormat.grammar.
  const grammarString = String.raw`
root ::= basic_array | basic_object
basic_any ::= basic_number | basic_string | basic_boolean | basic_null | basic_array | basic_object
basic_integer ::= ("0" | "-"? [1-9] [0-9]*) ".0"?
basic_number ::= ("0" | "-"? [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?
basic_string ::= (([\"] basic_string_1 [\"]))
basic_string_1 ::= "" | [^"\\\x00-\x1F] basic_string_1 | "\\" escape basic_string_1
escape ::= ["\\/bfnrt] | "u" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9]
basic_boolean ::= "true" | "false"
basic_null ::= "null"
basic_array ::= "[" ("" | ws basic_any (ws "," ws basic_any)*) ws "]"
basic_object ::= "{" ("" | ws basic_string ws ":" ws basic_any ( ws "," ws basic_string ws ":" ws basic_any)*) ws "}"
ws ::= [ \n\t]*
`;
  const reportProgress = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };

  // Most models in WebLLM support grammar-constrained decoding; pick any one.
  const modelId = "Llama-3.2-3B-Instruct-q4f16_1-MLC";
  // const modelId = "Qwen2.5-1.5B-Instruct-q4f16_1-MLC";
  // const modelId = "Phi-3.5-mini-instruct-q4f16_1-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    modelId,
    { initProgressCallback: reportProgress, logLevel: "INFO" },
  );

  // The model still needs to be prompted to answer in JSON, either in the
  // user's message or in the system prompt.
  const completionRequest: webllm.ChatCompletionRequest = {
    stream: false, // works with streaming, logprobs, top_logprobs as well
    messages: [
      {
        role: "user",
        content: "Introduce yourself in JSON",
      },
    ],
    max_tokens: 128,
    response_format: {
      type: "grammar",
      grammar: grammarString,
    } as webllm.ResponseFormat,
  };

  const response = await engine.chatCompletion(completionRequest);
  console.log(response);
  console.log("Output:\n" + (await engine.getMessage()));
  console.log(response.usage);
}
// Entry point: uncomment exactly one example to run it (each loads a model).
async function main() {
  // await simpleStructuredTextExample();
  await harryPotterExample();
  // await functionCallingExample();
  // await ebnfGrammarExample();
}
main();
================================================
FILE: examples/logit-processor/README.md
================================================
# WebLLM Logit Processor and Low-Level API Example
This folder explains the usage of `LogitProcessor`, demonstrating how it can be used to
manipulate the raw logits before sampling the token (e.g. setting certain tokens to `inf` or `-inf`).
We demonstrate how to use it with and without a web worker, which can be toggled with `USE_WEB_WORKER`
in `logit_processor.ts` (see `worker.ts` on how `LogitProcessor` plays a role there).
We also demonstrate the usage of a low-level API `forwardTokenAndSample()`, which, unlike `chat.completions.create()`
that assumes the usage is for autoregressive chatting, here we have more fine-grained control.
See `my_logit_processor.ts` on how to customize your own logit processor. Here we make the logit
of token 0 `100.0` manually, large enough that we should expect to always sample token 0, which
is indeed the case if we observe the console log. We also demonstrate that a LogitProcessor can be
stateful, and the state can also be cleared with `LogitProcessor.resetState()`.
To try it out, you can do the following steps under this folder
```bash
npm install
npm start
```
Note: if you would like to hack on the WebLLM core package, you can change the web-llm dependency to `"file:../.."` and follow the build-from-source instructions in the project to build WebLLM locally.
================================================
FILE: examples/logit-processor/package.json
================================================
{
"name": "logit-processor",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/logit_processor.html --port 8885",
"build": "parcel build src/logit_processor.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/logit-processor/src/logit_processor.html
================================================
WebLLM Logit Processor Test Page
Open console to see the effect of your logit processor.
================================================
FILE: examples/logit-processor/src/logit_processor.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
import { MyLogitProcessor } from "./my_logit_processor";
const USE_WEB_WORKER = true; // Toggle this to use Logit Processor without a web worker
const AUTOREGRESS_LIMIT = 32; // How many tokens to generate for this test
/** Write `text` into the DOM element with the given `id`; throws if absent. */
function setLabel(id: string, text: string) {
  const target = document.getElementById(id);
  if (target === null) {
    throw Error("Cannot find label " + id);
  }
  target.innerText = text;
}
/**
 * Demonstrates the low-level `forwardTokensAndSample()` API together with a
 * custom LogitProcessor, with or without a web worker (toggle USE_WEB_WORKER).
 */
async function main() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };

  // Instantiate myLogitProcessor, registering it in the logitProcessorRegistry.
  const myLogitProcessor = new MyLogitProcessor();
  const logitProcessorRegistry = new Map<string, webllm.LogitProcessor>();
  logitProcessorRegistry.set("phi-2-q4f32_1-MLC", myLogitProcessor);

  let engine: webllm.MLCEngineInterface;

  // Depending on whether we use a web worker, the setup is slightly different.
  if (USE_WEB_WORKER) {
    // see worker.ts on how LogitProcessor plays a role there
    engine = await webllm.CreateWebWorkerMLCEngine(
      new Worker(new URL("./worker.ts", import.meta.url), { type: "module" }),
      "phi-2-q4f32_1-MLC",
      { initProgressCallback: initProgressCallback },
    );
  } else {
    engine = await webllm.CreateMLCEngine("phi-2-q4f32_1-MLC", {
      initProgressCallback: initProgressCallback,
      logitProcessorRegistry: logitProcessorRegistry,
    });
  }

  // Below we demonstrate the usage of the low-level API `forwardTokensAndSample()`.
  const prompt: Array<number> = [42];
  let nextToken = await engine.forwardTokensAndSample(
    prompt,
    /*isPrefill=*/ true,
  );
  console.log(nextToken);

  let counter = prompt.length;
  while (counter < AUTOREGRESS_LIMIT) {
    counter += 1;
    nextToken = await engine.forwardTokensAndSample(
      [nextToken],
      /*isPrefill=*/ false,
    );
    console.log(nextToken);
  }

  // Calling `engine.resetChat()` triggers MyLogitProcessor.resetState().
  // Await it so the reset completes before generating again (it was a
  // floating promise before, racing with the next prefill).
  await engine.resetChat();

  counter = prompt.length;
  nextToken = await engine.forwardTokensAndSample(prompt, /*isPrefill=*/ true);
  console.log(nextToken);
  while (counter < AUTOREGRESS_LIMIT) {
    counter += 1;
    nextToken = await engine.forwardTokensAndSample(
      [nextToken],
      /*isPrefill=*/ false,
    );
    console.log(nextToken);
  }

  // `forwardTokensAndSample()` is made compatible with registering runtime stats.
  console.log(await engine.runtimeStatsText());
}
main();
================================================
FILE: examples/logit-processor/src/my_logit_processor.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
// Define LogitProcessor
/**
 * Example stateful LogitProcessor: forces token 0 to be sampled by giving it
 * a very large logit, and records every sampled token.
 */
export class MyLogitProcessor implements webllm.LogitProcessor {
  // Tokens sampled so far; cleared by resetState().
  // (restored the stripped type argument: Array<number>)
  private tokenSequence: Array<number> = [];

  /** Mutate and return the raw logits before the engine samples a token. */
  processLogits(logits: Float32Array): Float32Array {
    logits[0] = 100.0; // should be enough so that we always sample token 0 below
    return logits;
  }

  /** Called with each sampled token; tracks how many have been generated. */
  processSampledToken(token: number): void {
    this.tokenSequence.push(token);
    console.log("processSampledToken: " + this.tokenSequence.length);
  }

  /** Clear internal state; triggered by `engine.resetChat()`. */
  resetState(): void {
    this.tokenSequence = [];
    console.log("resetState");
  }
}
================================================
FILE: examples/logit-processor/src/worker.ts
================================================
// Serve the chat workload through web worker
import * as webllm from "@mlc-ai/web-llm";
import { MyLogitProcessor } from "./my_logit_processor";
console.log("Use web worker for logit processor");
const myLogitProcessor = new MyLogitProcessor();
const logitProcessorRegistry = new Map();
logitProcessorRegistry.set("phi-2-q4f32_1-MLC", myLogitProcessor);
const handler = new webllm.WebWorkerMLCEngineHandler();
handler.setLogitProcessorRegistry(logitProcessorRegistry);
self.onmessage = (msg: MessageEvent) => {
handler.onmessage(msg);
};
================================================
FILE: examples/multi-models/README.md
================================================
# WebLLM Multi-Models Example
This folder provides a minimum demo to show the WebLLM API running multiple models concurrently in a webapp setting.
To try it out, you can do the following steps under this folder
```bash
npm install
npm start
```
Note if you would like to hack WebLLM core package.
You can change web-llm dependencies as `"file:../.."`, and follow the build from source
instruction in the project to build webllm locally. This option is only recommended
if you would like to hack WebLLM core package.
================================================
FILE: examples/multi-models/package.json
================================================
{
"name": "get-started",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/multi_models.html --port 8888",
"build": "parcel build src/multi_models.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/multi-models/src/main.ts
================================================
/**
* This example demonstrates loading multiple models in the same engine concurrently.
* sequentialGeneration() shows inference each model one at a time.
* parallelGeneration() shows inference both models at the same time.
* This example uses WebWorkerMLCEngine, but the same idea applies to MLCEngine and
* ServiceWorkerMLCEngine as well.
*/
import * as webllm from "@mlc-ai/web-llm";
/** Render `text` into the element whose id is `id`; throws when missing. */
function setLabel(id: string, text: string) {
  const node = document.getElementById(id);
  if (node === null) {
    throw Error("Cannot find label " + id);
  }
  node.innerText = text;
}
// Progress reporting shared by both demo modes below.
const initProgressCallback = (report: webllm.InitProgressReport) => {
  setLabel("init-label", report.text);
};

// Prepare request for each model, same for both methods
const selectedModel1 = "Phi-3.5-mini-instruct-q4f32_1-MLC-1k";
const selectedModel2 = "gemma-2-2b-it-q4f32_1-MLC-1k";
const prompt1 = "Tell me about California in 3 short sentences.";
const prompt2 = "Tell me about New York City in 3 short sentences.";
// Show in the page which model answers which prompt.
setLabel("prompt-label-1", `(with model ${selectedModel1})\n` + prompt1);
setLabel("prompt-label-2", `(with model ${selectedModel2})\n` + prompt2);

// Both requests stream and ask for usage stats in the final chunk.
const request1: webllm.ChatCompletionRequestStreaming = {
  stream: true,
  stream_options: { include_usage: true },
  messages: [{ role: "user", content: prompt1 }],
  model: selectedModel1, // without specifying it, error will throw due to ambiguity
  max_tokens: 128,
};
const request2: webllm.ChatCompletionRequestStreaming = {
  stream: true,
  stream_options: { include_usage: true },
  messages: [{ role: "user", content: prompt2 }],
  model: selectedModel2, // without specifying it, error will throw due to ambiguity
  max_tokens: 128,
};
/**
* Chat completion (OpenAI style) with streaming, with two models in the pipeline.
*/
async function sequentialGeneration() {
const engine = await webllm.CreateWebWorkerMLCEngine(
new Worker(new URL("./worker.ts", import.meta.url), { type: "module" }),
[selectedModel1, selectedModel2],
{ initProgressCallback: initProgressCallback },
);
const asyncChunkGenerator1 = await engine.chat.completions.create(request1);
let message1 = "";
for await (const chunk of asyncChunkGenerator1) {
// console.log(chunk);
message1 += chunk.choices[0]?.delta?.content || "";
setLabel("generate-label-1", message1);
if (chunk.usage) {
console.log(chunk.usage); // only last chunk has usage
}
// engine.interruptGenerate(); // works with interrupt as well
}
const asyncChunkGenerator2 = await engine.chat.completions.create(request2);
let message2 = "";
for await (const chunk of asyncChunkGenerator2) {
// console.log(chunk);
message2 += chunk.choices[0]?.delta?.content || "";
setLabel("generate-label-2", message2);
if (chunk.usage) {
console.log(chunk.usage); // only last chunk has usage
}
// engine.interruptGenerate(); // works with interrupt as well
}
// without specifying from which model to get message, error will throw due to ambiguity
console.log("Final message 1:\n", await engine.getMessage(selectedModel1));
console.log("Final message 2:\n", await engine.getMessage(selectedModel2));
}
/**
 * Chat completion (OpenAI style) with streaming, serving both models'
 * requests concurrently in the same engine.
 */
async function parallelGeneration() {
  const engine = await webllm.CreateWebWorkerMLCEngine(
    new Worker(new URL("./worker.ts", import.meta.url), { type: "module" }),
    [selectedModel1, selectedModel2],
    { initProgressCallback: initProgressCallback },
  );

  // The two requests are served concurrently.
  const streamModel1 = async () => {
    let reply = "";
    const generator = await engine.chat.completions.create(request1);
    for await (const chunk of generator) {
      reply += chunk.choices[0]?.delta?.content || "";
      setLabel("generate-label-1", reply);
      if (chunk.usage) {
        console.log(chunk.usage); // only last chunk has usage
      }
      // engine.interruptGenerate(); // works with interrupt as well
    }
  };
  const streamModel2 = async () => {
    let reply = "";
    const generator = await engine.chat.completions.create(request2);
    for await (const chunk of generator) {
      reply += chunk.choices[0]?.delta?.content || "";
      setLabel("generate-label-2", reply);
      if (chunk.usage) {
        console.log(chunk.usage); // only last chunk has usage
      }
      // engine.interruptGenerate(); // works with interrupt as well
    }
  };
  await Promise.all([streamModel1(), streamModel2()]);

  // Note: concurrent requests to the SAME model are executed sequentially
  // (FCFS), unlike requests to different models as above.
  // For more, see https://github.com/mlc-ai/web-llm/pull/549
  // await Promise.all([streamModel1(), streamModel1()]);

  // Without specifying from which model to get the message, an error will
  // throw due to ambiguity.
  console.log("Final message 1:\n", await engine.getMessage(selectedModel1));
  console.log("Final message 2:\n", await engine.getMessage(selectedModel2));
}
// Pick one to run
// (run only one at a time; each loads the same pair of models)
sequentialGeneration();
// parallelGeneration();
================================================
FILE: examples/multi-models/src/multi_models.html
================================================
WebLLM Test Page
Open console to see output
Prompt 1
Response from model 1
Prompt 2
Response from model 2
================================================
FILE: examples/multi-models/src/worker.ts
================================================
import { WebWorkerMLCEngineHandler } from "@mlc-ai/web-llm";
// Hookup an engine to a worker handler
// Instantiate the handler that bridges this worker to the main-thread engine.
const engineHandler = new WebWorkerMLCEngineHandler();
self.onmessage = (ev: MessageEvent) => {
  engineHandler.onmessage(ev);
};
================================================
FILE: examples/multi-round-chat/README.md
================================================
### OpenAI API Demos
Run `npm install` first, followed by `npm start`.
Note if you would like to hack WebLLM core package,
you can change web-llm dependencies as `"file:../.."`, and follow the build from source
instruction in the project to build webllm locally. This option is only recommended
if you would like to hack WebLLM core package.
================================================
FILE: examples/multi-round-chat/package.json
================================================
{
"name": "openai-api",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/multi_round_chat.html --port 8888",
"build": "parcel build src/multi_round_chat.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/multi-round-chat/src/multi_round_chat.html
================================================
WebLLM Test Page
Open console to see output
================================================
FILE: examples/multi-round-chat/src/multi_round_chat.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
/** Display `text` in the element identified by `id`, failing loudly if absent. */
function setLabel(id: string, text: string) {
  const elem = document.getElementById(id);
  if (elem == null) {
    throw Error("Cannot find label " + id);
  }
  elem.innerText = text;
}
/**
* We demonstrate multiround chatting. Though users are required to maintain chat history, internally
* we compare provided `messages` with the internal chat history. If it matches, we will reuse KVs
* and hence save computation -- essentially an implicit internal optimization.
*/
/** Runs two chat rounds and verifies that round 1 reused the cached prefix. */
async function main() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    { initProgressCallback: initProgressCallback },
  );

  // Round 0
  const messages: webllm.ChatCompletionMessageParam[] = [
    {
      role: "system",
      content:
        "You are a helpful, respectful and honest assistant. " +
        "Be as happy as you can when speaking please. ",
    },
    { role: "user", content: "Provide me three US states." },
  ];

  const request0: webllm.ChatCompletionRequest = {
    stream: false, // can be streaming, same behavior
    messages: messages,
  };
  const reply0 = await engine.chat.completions.create(request0);
  const replyMessage0 = await engine.getMessage();
  console.log(reply0);
  console.log(replyMessage0);
  console.log(reply0.usage);

  // Round 1
  // Append generated response to messages
  messages.push({ role: "assistant", content: replyMessage0 });
  // Append new user input
  messages.push({ role: "user", content: "Two more please!" });
  // Below line would cause an internal reset (clear KV cache, etc.) since the
  // history no longer matches the new request
  // messages[0].content = "Another system prompt";
  const request1: webllm.ChatCompletionRequest = {
    stream: false, // can be streaming, same behavior
    messages: messages,
  };
  const reply1 = await engine.chat.completions.create(request1);
  const replyMessage1 = await engine.getMessage();
  console.log(reply1);
  console.log(replyMessage1);
  console.log(reply1.usage);

  // If multi-round chat kicked in, request1 should only prefill a small
  // number of tokens (the new turn), not the whole conversation.
  const prefillTokens0 = reply0.usage?.prompt_tokens;
  const prefillTokens1 = reply1.usage?.prompt_tokens;
  console.log("Request 0 prompt tokens: ", prefillTokens0);
  console.log("Request 1 prompt tokens: ", prefillTokens1);
  if (
    prefillTokens0 === undefined ||
    prefillTokens1 === undefined ||
    prefillTokens1 > prefillTokens0
  ) {
    throw Error("Multi-round chat is not triggered as expected.");
  }
}
main();
================================================
FILE: examples/next-simple-chat/README.md
================================================
This is a [Next.js](https://nextjs.org/) project using web-llm.
## Getting Started
First, install web-llm from source.
Then, run the development server:
```bash
npm run dev
# or
yarn dev
# or
pnpm dev
```
Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
================================================
FILE: examples/next-simple-chat/next.config.js
================================================
/** @type {import('next').NextConfig} */
// Client-side bundles cannot resolve Node-only modules pulled in transitively;
// stub them out so the webpack build does not fail.
const nextConfig = {
  reactStrictMode: true,
  webpack: (config, { isServer }) => {
    // Fixes npm packages that depend on `fs` module
    if (!isServer) {
      config.resolve.fallback = {
        ...config.resolve.fallback, // if you miss it, all the other options in fallback, specified
        // by next.js will be dropped. Doesn't make much sense, but how it is
        fs: false, // the solution
        module: false,
        perf_hooks: false,
      };
    }
    return config;
  },
};

module.exports = nextConfig;
================================================
FILE: examples/next-simple-chat/package.json
================================================
{
"name": "next-simple-chat",
"version": "0.1.0",
"private": true,
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"lint": "next lint"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80",
"@types/node": "20.3.3",
"@types/react": "18.2.14",
"@types/react-dom": "18.2.6",
"autoprefixer": "10.4.14",
"eslint": "8.44.0",
"eslint-config-next": "13.4.7",
"next": "^13.5.6",
"postcss": "8.4.24",
"react": "18.2.0",
"react-dom": "18.2.0",
"tailwindcss": "3.3.2",
"typescript": "5.1.6"
}
}
================================================
FILE: examples/next-simple-chat/postcss.config.js
================================================
// PostCSS pipeline: Tailwind first, then vendor prefixing via Autoprefixer.
module.exports = {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
};
================================================
FILE: examples/next-simple-chat/tailwind.config.js
================================================
/** @type {import('tailwindcss').Config} */
// Tailwind setup: scan pages/components/app sources for class names.
module.exports = {
  content: [
    "./src/pages/**/*.{js,ts,jsx,tsx,mdx}",
    "./src/components/**/*.{js,ts,jsx,tsx,mdx}",
    "./src/app/**/*.{js,ts,jsx,tsx,mdx}",
  ],
  theme: {
    extend: {
      // Gradient helpers from the default Next.js starter theme.
      backgroundImage: {
        "gradient-radial": "radial-gradient(var(--tw-gradient-stops))",
        "gradient-conic":
          "conic-gradient(from 180deg at 50% 50%, var(--tw-gradient-stops))",
      },
    },
  },
  plugins: [],
};
================================================
FILE: examples/next-simple-chat/tsconfig.json
================================================
{
"compilerOptions": {
"target": "es5",
"lib": ["dom", "dom.iterable", "esnext"],
"allowJs": true,
"skipLibCheck": true,
"strict": true,
"forceConsistentCasingInFileNames": true,
"noEmit": true,
"esModuleInterop": true,
"module": "esnext",
"moduleResolution": "node",
"resolveJsonModule": true,
"isolatedModules": true,
"jsx": "preserve",
"incremental": true,
"paths": {
"~/*": ["./src/*"]
}
},
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx"],
"exclude": ["node_modules"]
}
================================================
FILE: examples/next-simple-chat/src/pages/_app.tsx
================================================
import "~/styles/globals.css";
import type { AppProps } from "next/app";
// Root Next.js App component: applies global styles to every page.
export default function App({ Component, pageProps }: AppProps) {
  // NOTE(review): the returned JSX (presumably <Component {...pageProps} />)
  // appears to have been stripped during text extraction — confirm upstream.
  return ;
}
================================================
FILE: examples/next-simple-chat/src/pages/_document.tsx
================================================
import { Html, Head, Main, NextScript } from "next/document";
// Custom Next.js Document. NOTE(review): the JSX body appears stripped during
// extraction; given the imports above it would normally render
// <Html><Head/><body><Main/><NextScript/></body></Html> — confirm upstream.
export default function Document() {
  return (
  );
}
================================================
FILE: examples/next-simple-chat/src/pages/index.tsx
================================================
import Head from "next/head";
import ChatComponent from "~/utils/chat_component";
import { Inter } from "next/font/google";
const inter = Inter({ subsets: ["latin"] });
// Home page. NOTE(review): the JSX here appears partially stripped during
// extraction — "Example App" is the visible remnant of the original
// <Head>/<main> markup (likely rendering ChatComponent) — confirm upstream.
export default function Home() {
  return (
    <>
      Example App
    >
  );
}
================================================
FILE: examples/next-simple-chat/src/pages/api/hello.ts
================================================
// Next.js API route support: https://nextjs.org/docs/api-routes/introduction
import type { NextApiRequest, NextApiResponse } from "next";
/** Shape of the JSON payload returned by this example API route. */
type Data = {
  name: string;
};

/** Minimal Next.js API route: always responds 200 with a fixed name. */
export default function handler(
  req: NextApiRequest,
  res: NextApiResponse<Data>, // restored stripped type argument
) {
  res.status(200).json({ name: "John Doe" });
}
================================================
FILE: examples/next-simple-chat/src/styles/globals.css
================================================
@tailwind base;
@tailwind components;
@tailwind utilities;
:root {
--foreground-rgb: 0, 0, 0;
--background-start-rgb: 214, 219, 220;
--background-end-rgb: 255, 255, 255;
}
@media (prefers-color-scheme: dark) {
:root {
--foreground-rgb: 255, 255, 255;
--background-start-rgb: 0, 0, 0;
--background-end-rgb: 0, 0, 0;
}
}
body {
color: rgb(var(--foreground-rgb));
background: linear-gradient(
to bottom,
transparent,
rgb(var(--background-end-rgb))
)
rgb(var(--background-start-rgb));
}
a {
color: inherit;
text-decoration: none;
}
* {
box-sizing: border-box;
}
/* NOTE(review): element selector "chatui-chat" (no leading dot) and the
   unitless "height: 100" both look suspicious — confirm whether
   ".chatui-chat { height: 100%; }" was intended. */
chatui-chat {
  height: 100;
}
.chatui {
display: flex;
flex-flow: column wrap;
justify-content: space-between;
width: 100%;
max-width: 867px;
margin: 25px 10px;
height: 600px;
border: 2px solid #ddd;
border-radius: 5px;
box-shadow: 0 15px 15px -5px rgba(0, 0, 0, 0.2);
}
/* Header bar of the chat widget. The selector previously read
   "s .chatui-header", so the rule only matched headers nested inside an <s>
   element and was effectively dead; the stray "s" is removed. */
.chatui-header {
  display: flex;
  justify-content: space-between;
  padding: 10px;
  border-bottom: 2px solid #ddd;
  background: #eee;
  color: #666;
}
.chatui-chat {
flex: 1;
overflow-y: auto;
padding: 10px;
}
.chatui-chat::-webkit-scrollbar {
width: 6px;
}
.chatui-chat::-webkit-scrollbar-track {
background: #ddd;
}
.chatui-chat::-webkit-scrollbar-thumb {
background: #bdbdbd;
}
.msg {
display: flex;
align-items: flex-end;
margin-bottom: 10px;
}
.msg:last-of-type {
margin: 0;
}
.msg-bubble {
max-width: 450px;
padding: 15px;
border-radius: 15px;
background: #ececec;
}
.left-msg .msg-bubble {
border-bottom-left-radius: 0;
}
.error-msg .msg-bubble {
border-bottom-left-radius: 0;
color: #f15959;
}
.init-msg .msg-bubble {
border-bottom-left-radius: 0;
}
.right-msg {
flex-direction: row-reverse;
}
.right-msg .msg-bubble {
background: #579ffb;
color: #fff;
border-bottom-right-radius: 0;
}
.chatui-inputarea {
display: flex;
padding: 10px;
border-top: 2px solid #ddd;
background: #eee;
}
.chatui-inputarea * {
padding: 10px;
border: none;
border-radius: 3px;
font-size: 1em;
}
.chatui-input {
flex: 1;
background: #ddd;
}
.chatui-btn {
margin-left: 10px;
background: #579ffb;
color: #fff;
font-weight: bold;
cursor: pointer;
padding: 10px;
}
.chatui-btn:hover {
background: #577bfb;
}
.chatui-chat {
background-color: #fcfcfe;
}
================================================
FILE: examples/next-simple-chat/src/utils/chat_component.tsx
================================================
import { useState } from "react";
import { MLCEngine } from "@mlc-ai/web-llm";
import ChatUI from "~/utils/chat_ui";
// Hosts the chat example: message-list state plus a ChatUI bound to an engine.
const ChatComponent = () => {
  // Rendered transcript: each entry is a styled bubble of a given kind.
  const [messages, setMessages] = useState<{ kind: string; text: string }[]>(
    [],
  );
  const [prompt, setPrompt] = useState("");
  const [runtimeStats, setRuntimeStats] = useState("");
  // ChatUI wraps the engine; created once per component instance.
  const [chat_ui] = useState(new ChatUI(new MLCEngine()));
  // Append a new bubble, or overwrite the last one when append is false.
  const updateMessage = (kind: string, text: string, append: boolean) => {
    if (kind == "init") {
      text = "[System Initalize] " + text;
    }
    // NOTE(review): this reads the `messages` closure instead of using the
    // functional setState form, so rapid successive updates may act on stale
    // state and clobber each other — confirm intended behavior.
    const msgCopy = [...messages];
    if (msgCopy.length == 0 || append) {
      setMessages([...msgCopy, { kind, text }]);
    } else {
      msgCopy[msgCopy.length - 1] = { kind, text };
      setMessages([...msgCopy]);
    }
  };
  // NOTE(review): the JSX previously returned here appears to have been
  // stripped during text extraction — confirm against the upstream source.
  return (
  );
};
export default ChatComponent;
================================================
FILE: examples/next-simple-chat/src/utils/chat_ui.ts
================================================
import {
MLCEngineInterface,
ChatCompletionMessageParam,
CompletionUsage,
} from "@mlc-ai/web-llm";
/**
 * Controller that sequentializes chat requests against a single engine and
 * relays streamed output / progress back to the UI through callbacks.
 */
export default class ChatUI {
  private engine: MLCEngineInterface;
  // Whether a model has been loaded into the engine.
  private chatLoaded = false;
  // Guards against overlapping generate requests from the UI.
  private requestInProgress = false;
  // We use a request chain to ensure that
  // all requests sent to chat are sequentialized
  // (restored the stripped type argument: Promise<void>)
  private chatRequestChain: Promise<void> = Promise.resolve();
  // Full conversation so far, replayed to the engine on each generation.
  private chatHistory: ChatCompletionMessageParam[] = [];

  constructor(engine: MLCEngineInterface) {
    this.engine = engine;
  }
  /**
   * Push a task to the execution queue.
   *
   * @param task The task to be executed;
   */
  private pushTask(task: () => Promise<void>) {
    const lastEvent = this.chatRequestChain;
    this.chatRequestChain = lastEvent.then(task);
  }
  // Event handlers:
  // every handler pushes its work onto the queue so tasks run sequentially;
  // an in-flight generation can be cut short via engine.interruptGenerate()
  // (see onReset).
  async onGenerate(
    prompt: string,
    messageUpdate: (kind: string, text: string, append: boolean) => void,
    setRuntimeStats: (runtimeStats: string) => void,
  ) {
    if (this.requestInProgress) {
      return;
    }
    this.pushTask(async () => {
      await this.asyncGenerate(prompt, messageUpdate, setRuntimeStats);
    });
    return this.chatRequestChain;
  }

  /** Interrupt any running generation, clear history, then reset the engine. */
  async onReset(clearMessages: () => void) {
    if (this.requestInProgress) {
      // interrupt previous generation if any
      this.engine.interruptGenerate();
    }
    this.chatHistory = [];
    // try reset after previous requests finishes
    this.pushTask(async () => {
      await this.engine.resetChat();
      clearMessages();
    });
    return this.chatRequestChain;
  }

  /** Load the model on first use, reporting progress through messageUpdate. */
  async asyncInitChat(
    messageUpdate: (kind: string, text: string, append: boolean) => void,
  ) {
    if (this.chatLoaded) return;
    this.requestInProgress = true;
    messageUpdate("init", "", true);
    const initProgressCallback = (report: { text: string }) => {
      messageUpdate("init", report.text, false);
    };
    this.engine.setInitProgressCallback(initProgressCallback);

    try {
      const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
      // const selectedModel = "TinyLlama-1.1B-Chat-v0.4-q4f16_1-MLC-1k";
      await this.engine.reload(selectedModel);
    } catch (err: unknown) {
      messageUpdate("error", "Init error, " + (err?.toString() ?? ""), true);
      console.log(err);
      await this.unloadChat();
      this.requestInProgress = false;
      return;
    }
    this.requestInProgress = false;
    this.chatLoaded = true;
  }

  private async unloadChat() {
    await this.engine.unload();
    this.chatLoaded = false;
  }

  /**
   * Run generate: stream a completion for `prompt`, updating the UI chunk by
   * chunk and reporting token-throughput stats from the final usage record.
   */
  private async asyncGenerate(
    prompt: string,
    messageUpdate: (kind: string, text: string, append: boolean) => void,
    setRuntimeStats: (runtimeStats: string) => void,
  ) {
    await this.asyncInitChat(messageUpdate);
    this.requestInProgress = true;
    // const prompt = this.uiChatInput.value;
    if (prompt == "") {
      this.requestInProgress = false;
      return;
    }

    messageUpdate("right", prompt, true);
    // this.uiChatInput.value = "";
    // this.uiChatInput.setAttribute("placeholder", "Generating...");

    messageUpdate("left", "", true);

    try {
      this.chatHistory.push({ role: "user", content: prompt });
      let curMessage = "";
      let usage: CompletionUsage | undefined = undefined;
      const completion = await this.engine.chat.completions.create({
        stream: true,
        messages: this.chatHistory,
        stream_options: { include_usage: true },
      });
      for await (const chunk of completion) {
        const curDelta = chunk.choices[0]?.delta.content;
        if (curDelta) {
          curMessage += curDelta;
        }
        messageUpdate("left", curMessage, false);
        if (chunk.usage) {
          usage = chunk.usage;
        }
      }
      const output = await this.engine.getMessage();
      this.chatHistory.push({ role: "assistant", content: output });
      messageUpdate("left", output, false);
      if (usage) {
        const runtimeStats =
          `prompt_tokens: ${usage.prompt_tokens}, ` +
          `completion_tokens: ${usage.completion_tokens}, ` +
          `prefill: ${usage.extra.prefill_tokens_per_s.toFixed(4)} tokens/sec, ` +
          `decoding: ${usage.extra.decode_tokens_per_s.toFixed(4)} tokens/sec`;
        setRuntimeStats(runtimeStats);
      }
    } catch (err: unknown) {
      messageUpdate(
        "error",
        "Generate error, " + (err?.toString() ?? ""),
        true,
      );
      console.log(err);
      await this.unloadChat();
    }
    this.requestInProgress = false;
  }
}
================================================
FILE: examples/qwen3/README.md
================================================
### OpenAI API Demos w/ Qwen3
Run `npm install` first, followed by `npm start`.
Note if you would like to hack WebLLM core package,
you can change web-llm dependencies as `"file:../.."`, and follow the build from source
instruction in the project to build webllm locally. This option is only recommended
if you would like to hack WebLLM core package.
================================================
FILE: examples/qwen3/package.json
================================================
{
"name": "qwen3_example",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/qwen3_example.html --port 8883",
"build": "parcel build src/qwen3_example.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/qwen3/src/qwen3_example.html
================================================
WebLLM Test Page
Open console to see output
Response
================================================
FILE: examples/qwen3/src/qwen3_example.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
// Write `text` into the DOM element with the given id; throws if the
// element does not exist.
function setLabel(id: string, text: string) {
  const target = document.getElementById(id);
  if (!target) {
    throw Error("Cannot find label " + id);
  }
  target.innerText = text;
}
// Helper method to stream responses from the engine
// Helper method to stream responses from the engine
/**
 * Stream a chat completion from `engine`, rendering the partial message into
 * the "generate-label" element as chunks arrive.
 *
 * @param engine The loaded WebLLM engine.
 * @param request A streaming chat-completion request.
 */
async function streamResponse(
  engine: webllm.MLCEngineInterface,
  request: webllm.ChatCompletionRequestStreaming,
): Promise<void> {
  // Fixed: the return type's `<void>` argument was lost (bare `Promise` is
  // invalid TypeScript — the generic requires a type argument).
  console.log("Requesting chat completion with request:", request);
  const asyncChunkGenerator = await engine.chat.completions.create(request);
  let message = "";
  for await (const chunk of asyncChunkGenerator) {
    message += chunk.choices[0]?.delta?.content || "";
    setLabel("generate-label", message);
    if (chunk.usage) {
      console.log(chunk.usage); // only last chunk has usage
    }
    // engine.interruptGenerate(); // works with interrupt as well
  }
  console.log("Final message:\n", await engine.getMessage()); // the concatenated message
}
/**
 * We demonstrate how Qwen3's best practices can be followed in WebLLM. For more, see
 * https://huggingface.co/Qwen/Qwen3-8B#best-practices.
 */
async function main() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  const selectedModel = "Qwen3-4B-q4f16_1-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    { initProgressCallback: initProgressCallback },
  );

  /**
   * 1. Default behavior: enable thinking
   */
  let request: webllm.ChatCompletionRequest = {
    stream: true,
    stream_options: { include_usage: true },
    messages: [
      {
        role: "user",
        content: "How many r's are there in the word strawberry?",
      },
    ],
    // Specifying `enable_thinking` is optional, as it defaults to think.
    // extra_body: {
    //   enable_thinking: true,
    // }
  };
  await streamResponse(engine, request);

  /**
   * 2. Disable thinking with `enable_thinking: false`.
   */
  request = {
    stream: true,
    stream_options: { include_usage: true },
    messages: [
      {
        role: "user",
        content: "How many r's are there in the word strawberry?",
      },
    ],
    extra_body: {
      enable_thinking: false,
    },
  };
  await streamResponse(engine, request);

  /**
   * 3. Disable thinking with soft switch /no_think
   * or enable thinking with soft switch /think.
   * Using soft switch: "When enable_thinking=True, regardless of whether the user
   * uses /think or /no_think, the model will always output a block wrapped in
   * <think>...</think>. However, the content inside this block may be empty if
   * thinking is disabled. When enable_thinking=False, the soft switches are not
   * valid. Regardless of any /think or /no_think tags input by the user, the
   * model will not generate think content and will not include a
   * <think>...</think> block."
   */
  request = {
    stream: true,
    stream_options: { include_usage: true },
    messages: [
      {
        role: "user",
        content: "How many r's are there in the word strawberry? /no_think",
        // content: "How many r's are there in the word strawberry? /think",
      },
    ],
  };
  await streamResponse(engine, request);

  /**
   * 4. For multi-turn messages, it is recommended to
   * parse out the thinking content in the history
   * messages as described in the Best Practices section.
   */
  const history: webllm.ChatCompletionMessageParam[] = [
    {
      role: "user",
      content: "How many r's are there in the word strawberry? /think",
    },
    {
      role: "assistant",
      // Restored: the <think>...</think> wrapper was stripped by extraction;
      // without it the preprocessing regex below has nothing to remove.
      content:
        "<think>\nDummy thinking content here...\n</think>\n\nThe answer is 3.",
    },
  ];
  // Preprocess history to remove thinking content
  const preprocessedHistory = history.map((msg) => {
    if (msg.role === "assistant") {
      // Remove the <think>...</think> block at the start of assistant
      // messages, including up to two trailing \n line breaks.
      const thinkRegex = /<think>.*?<\/think>\n?\n?/s;
      const contentWithoutThink = msg.content!.replace(thinkRegex, "").trim();
      return { ...msg, content: contentWithoutThink };
    }
    return msg; // User messages remain unchanged
  });
  console.log("Preprocessed history:", preprocessedHistory);
  // Now use the preprocessed history in the request
  const newMessage: webllm.ChatCompletionMessageParam = {
    role: "user",
    content: "What about blueberries?",
  };
  request = {
    stream: true,
    stream_options: { include_usage: true },
    messages: [...preprocessedHistory, newMessage],
  };
  await streamResponse(engine, request);
}

main();
================================================
FILE: examples/seed-to-reproduce/README.md
================================================
### OpenAI API Demos
Run `npm install` first, followed by `npm start`.
Note: if you would like to hack on the WebLLM core package, you can change the
web-llm dependency to `"file:../.."` and follow the build-from-source
instructions in the project to build WebLLM locally.
================================================
FILE: examples/seed-to-reproduce/package.json
================================================
{
"name": "seed-to-reproduce",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/seed.html --port 8888",
"build": "parcel build src/seed.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/seed-to-reproduce/src/seed.html
================================================
WebLLM Test Page
Open console to see output. We make two generations with same seed, we
should expect them to be the same.
================================================
FILE: examples/seed-to-reproduce/src/seed.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
// Render `text` into the element with the given id, failing loudly when the
// element is missing from the page.
function setLabel(id: string, text: string) {
  const el = document.getElementById(id);
  if (!el) {
    throw Error("Cannot find label " + id);
  }
  el.innerText = text;
}
/**
 * We demonstrate the effect of seeding. The prompt is about writing a poem and we use a high
 * `temperature`, making the sampling distribution supposedly more random. However, we demonstrate
 * that with seeding, we should see the exact same result being generated across two trials.
 * With `n > 1`, all choices should also be exactly the same.
 */
async function main() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    { initProgressCallback: initProgressCallback },
  );

  const request: webllm.ChatCompletionRequest = {
    stream: false, // works with streaming as well
    messages: [
      { role: "user", content: "Write a creative Haiku about Pittsburgh" },
    ],
    n: 3,
    temperature: 1.2, // high temperature gives much more random results
    max_tokens: 128, // To save time; enough to demonstrate the effect
    seed: 42,
  };

  const reply0 = await engine.chat.completions.create(request);
  console.log(reply0);
  console.log("First reply's last choice:\n" + (await engine.getMessage()));
  console.log(reply0.usage);

  const reply1 = await engine.chat.completions.create(request);
  console.log(reply1);
  console.log("Second reply's last choice:\n" + (await engine.getMessage()));

  // Rigorously check the generation results of each choice for the two requests
  for (const choice0 of reply0.choices) {
    const id = choice0.index;
    const choice1 = reply1.choices[id];
    if (choice0.message.content !== choice1.message.content) {
      // Fixed typo in error message: "Chocie" -> "Choice".
      throw Error(
        "Choice " +
          id +
          " of the two generations are different despite seeding",
      );
    }
  }
  console.log(reply1.usage);
}
// Run one of the functions
main();
================================================
FILE: examples/service-worker/README.md
================================================
# WebLLM Service Worker Example
This example shows how we can create a page with Web-LLM running in service worker.
```bash
npm install
npm run build
```
================================================
FILE: examples/service-worker/package.json
================================================
{
"name": "web-llm-service-worker",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "rm -rf .parcel-cache && parcel src/index.html --port 3000",
"build": "rm -rf .parcel-cache && parcel build src/index.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^6.0.3",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/service-worker/src/index.html
================================================
WebLLM Test Page
Open console to see output
Prompt
Response
================================================
FILE: examples/service-worker/src/main.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
// Register sw.ts as a module service worker (when the browser supports it)
// and log which lifecycle state the registration is in.
const registerServiceWorker = async () => {
  if (!("serviceWorker" in navigator)) {
    return;
  }
  try {
    const registration = await navigator.serviceWorker.register(
      new URL("sw.ts", import.meta.url),
      { type: "module" },
    );
    if (registration.installing) {
      console.log("Service worker installing");
    } else if (registration.waiting) {
      console.log("Service worker installed");
    } else if (registration.active) {
      console.log("Service worker active");
    }
  } catch (error) {
    console.error(`Registration failed with ${error}`);
  }
};
// Set the text content of the element with the given id; throw when absent.
function setLabel(id: string, text: string) {
  const node = document.getElementById(id);
  if (node == null) throw Error("Cannot find label " + id);
  node.innerText = text;
}
// There are two demonstrations, pick one to run
/**
 * Chat completion (OpenAI style) without streaming, where we get the entire response at once.
 */
async function mainNonStreaming() {
  const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
  // Progress callback is inlined into the engine options.
  const engine: webllm.MLCEngineInterface =
    await webllm.CreateServiceWorkerMLCEngine(selectedModel, {
      initProgressCallback: (report: webllm.InitProgressReport) => {
        setLabel("init-label", report.text);
      },
    });

  const reply0 = await engine.chat.completions.create({
    messages: [
      {
        role: "system",
        content:
          "You are a helpful, respectful and honest assistant. " +
          "Be as happy as you can when speaking please. ",
      },
      { role: "user", content: "Provide me three US states." },
      { role: "assistant", content: "California, New York, Pennsylvania." },
      { role: "user", content: "Two more please!" },
    ],
    n: 3,
    temperature: 1.5,
    max_tokens: 256,
  });
  console.log(reply0);
  setLabel("generate-label", reply0.choices[0].message.content || "");
  console.log(reply0.usage);
}
/**
 * Chat completion (OpenAI style) with streaming, where delta is sent while generating response.
 */
async function mainStreaming() {
  const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
  const engine: webllm.ServiceWorkerMLCEngine =
    await webllm.CreateServiceWorkerMLCEngine(selectedModel, {
      initProgressCallback: (report: webllm.InitProgressReport) => {
        setLabel("init-label", report.text);
      },
    });

  const chunks = await engine.chat.completions.create({
    stream: true,
    stream_options: { include_usage: true },
    messages: [
      {
        role: "system",
        content:
          "You are a helpful, respectful and honest assistant. " +
          "Be as happy as you can when speaking please. ",
      },
      { role: "user", content: "Provide me three US states." },
      { role: "assistant", content: "California, New York, Pennsylvania." },
      { role: "user", content: "Two more please!" },
    ],
    temperature: 1.5,
    max_tokens: 256,
  });

  // Accumulate deltas and render them incrementally.
  let message = "";
  for await (const chunk of chunks) {
    console.log(chunk);
    message += chunk.choices[0]?.delta?.content || "";
    setLabel("generate-label", message);
    if (chunk.usage) {
      console.log(chunk.usage); // only last chunk has usage
    }
    // engine.interruptGenerate(); // works with interrupt as well
  }
  console.log("Final message:\n", await engine.getMessage()); // the concatenated message
}
// Register the service worker first so the engine has a backend to talk to.
registerServiceWorker();
// Run one of the functions below (uncomment the one you want to try).
// mainNonStreaming();
mainStreaming();
================================================
FILE: examples/service-worker/src/sw.ts
================================================
import { ServiceWorkerMLCEngineHandler } from "@mlc-ai/web-llm";

// Keep the handler alive for the lifetime of the service worker.
let handler: ServiceWorkerMLCEngineHandler;

// Create the WebLLM message handler once this service worker activates.
self.addEventListener("activate", (event) => {
  handler = new ServiceWorkerMLCEngineHandler();
  console.log("Web-LLM Service Worker Activated");
});
================================================
FILE: examples/simple-chat-js/index.css
================================================
body,
html {
font-family: Arial, sans-serif;
padding: 10px 20px;
}
.download-container {
display: flex;
justify-content: space-between;
margin-bottom: 20px;
}
#download-status {
border: solid 1px black;
box-shadow:
0 10px 15px -3px rgba(0, 0, 0, 0.1),
0 4px 6px -2px rgba(0, 0, 0, 0.05);
padding: 10px;
}
.chat-container {
height: 400px;
width: 100%;
border: 2px solid black;
display: flex;
flex-direction: column;
}
.chat-box {
overflow-y: scroll;
background-color: #c3c3c3;
border: 1px solid #ccc;
padding: 5px;
flex: 1 1;
}
.chat-stats {
background-color: #d3eceb;
flex: 0 0;
padding: 10px;
font-size: 0.75rem;
}
.message-container {
width: 100%;
display: flex;
}
.message {
padding: 10px;
margin: 10px 0;
border-radius: 10px;
width: fit-content;
}
.message-container.user {
justify-content: end;
}
.message-container.assistant {
justify-content: start;
}
.message-container.user .message {
background: #007bff;
color: #fff;
}
.message-container.assistant .message {
background: #f1f0f0;
color: #333;
}
.chat-input-container {
min-height: 40px;
flex: 0 0;
display: flex;
}
#user-input {
width: 70%;
padding: 10px;
border: 1px solid #ccc;
}
button {
width: 25%;
padding: 10px;
border: none;
background-color: #007bff;
color: white;
cursor: pointer;
}
button:disabled {
background-color: lightgray;
cursor: not-allowed;
}
button:hover:not(:disabled) {
background-color: #0056b3;
}
.hidden {
display: none;
}
================================================
FILE: examples/simple-chat-js/index.html
================================================
Simple Chatbot
Step 1: Initialize WebLLM and Download Model
Step 2: Chat
================================================
FILE: examples/simple-chat-js/index.js
================================================
import * as webllm from "https://esm.run/@mlc-ai/web-llm";
/*************** WebLLM logic ***************/
// Conversation history sent with every request; seeded with a system prompt.
const messages = [
  {
    content: "You are a helpful AI agent helping users.",
    role: "system",
  },
];

// Every model id available in the prebuilt WebLLM app config.
const availableModels = webllm.prebuiltAppConfig.model_list.map(
  (m) => m.model_id,
);
// Fixed: prebuilt model ids carry the "-MLC" suffix (see the other examples
// in this repo); "…-q4f32_1-1k" is not a prebuilt id, so setting it as the
// dropdown default would fail. "-1k" selects the 1k-context variant.
let selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC-1k";
// Report model download/initialization progress to the console and to the
// status element on the page.
function updateEngineInitProgressCallback(progressReport) {
  console.log("initialize", progressReport.progress);
  document.getElementById("download-status").textContent = progressReport.text;
}
// Create engine instance
const engine = new webllm.MLCEngine();
engine.setInitProgressCallback(updateEngineInitProgressCallback);
// Load the model currently chosen in the dropdown into the shared engine,
// revealing the download-status element while it happens.
async function initializeWebLLMEngine() {
  document.getElementById("download-status").classList.remove("hidden");
  selectedModel = document.getElementById("model-selection").value;
  await engine.reload(selectedModel, { temperature: 1.0, top_p: 1 });
}
// Stream a chat completion for `messages`: calls onUpdate with the partial
// text after each chunk, onFinish(finalText, usage) when done, and onError
// if the request throws.
async function streamingGenerating(messages, onUpdate, onFinish, onError) {
  try {
    const completion = await engine.chat.completions.create({
      stream: true,
      messages,
      stream_options: { include_usage: true },
    });
    let reply = "";
    let lastUsage;
    for await (const chunk of completion) {
      reply += chunk.choices[0]?.delta.content ?? "";
      if (chunk.usage) {
        lastUsage = chunk.usage; // usage arrives on the final chunk
      }
      onUpdate(reply);
    }
    onFinish(await engine.getMessage(), lastUsage);
  } catch (err) {
    onError(err);
  }
}
/*************** UI logic ***************/
// Handle a click on "send": append the user's message to the conversation,
// render it, and start streaming the assistant's reply. Statement order
// matters: the send button is disabled before generation and re-enabled in
// the finish callback.
function onMessageSend() {
  const input = document.getElementById("user-input").value.trim();
  const message = {
    content: input,
    role: "user",
  };
  // Ignore empty submissions.
  if (input.length === 0) {
    return;
  }
  document.getElementById("send").disabled = true;

  messages.push(message);
  appendMessage(message);

  document.getElementById("user-input").value = "";
  document
    .getElementById("user-input")
    .setAttribute("placeholder", "Generating...");

  // Placeholder bubble that streaming updates will overwrite in place.
  const aiMessage = {
    content: "typing...",
    role: "assistant",
  };
  appendMessage(aiMessage);
  const onFinishGenerating = (finalMessage, usage) => {
    updateLastMessage(finalMessage);
    document.getElementById("send").disabled = false;
    // NOTE(review): assumes `usage` is always defined here; it is when the
    // request sets stream_options.include_usage — confirm if that changes.
    const usageText =
      `prompt_tokens: ${usage.prompt_tokens}, ` +
      `completion_tokens: ${usage.completion_tokens}, ` +
      `prefill: ${usage.extra.prefill_tokens_per_s.toFixed(4)} tokens/sec, ` +
      `decoding: ${usage.extra.decode_tokens_per_s.toFixed(4)} tokens/sec`;
    document.getElementById("chat-stats").classList.remove("hidden");
    document.getElementById("chat-stats").textContent = usageText;
  };
  streamingGenerating(
    messages,
    updateLastMessage,
    onFinishGenerating,
    console.error,
  );
}
// Render a chat bubble for `message`, aligned left/right by role, and keep
// the chat box scrolled to the newest entry.
function appendMessage(message) {
  const chatBox = document.getElementById("chat-box");

  const bubble = document.createElement("div");
  bubble.classList.add("message");
  bubble.textContent = message.content;

  const container = document.createElement("div");
  container.classList.add(
    "message-container",
    message.role === "user" ? "user" : "assistant",
  );
  container.appendChild(bubble);

  chatBox.appendChild(container);
  chatBox.scrollTop = chatBox.scrollHeight; // Scroll to the latest message
}
// Replace the text of the most recently appended chat bubble.
function updateLastMessage(content) {
  const bubbles = document
    .getElementById("chat-box")
    .querySelectorAll(".message");
  bubbles[bubbles.length - 1].textContent = content;
}
/*************** UI binding ***************/
// Populate the model dropdown from the prebuilt config and preselect the default.
for (const modelId of availableModels) {
  const option = document.createElement("option");
  option.value = modelId;
  option.textContent = modelId;
  document.getElementById("model-selection").appendChild(option);
}
document.getElementById("model-selection").value = selectedModel;

// Chat stays disabled until the engine has finished loading the model.
document.getElementById("download").addEventListener("click", () => {
  initializeWebLLMEngine().then(() => {
    document.getElementById("send").disabled = false;
  });
});
document.getElementById("send").addEventListener("click", () => {
  onMessageSend();
});
================================================
FILE: examples/simple-chat-ts/README.md
================================================
# SimpleChat
This folder provides a complete implementation of a simple
chat app based on WebLLM. To try it out, you can do the following steps
under this folder
```bash
npm install
npm start
```
Note: if you would like to hack on the WebLLM core package, you can change the
web-llm dependency to `"file:../.."` and follow the build-from-source
instructions in the project to build WebLLM locally.
Due to the differences in command-line tools between Unix/Linux and Windows systems, special adaptation is necessary for Windows. Unix/Linux systems natively support commands like `cp` for file operations, which are not directly available in Windows. To ensure cross-platform compatibility, we use a Node.js script for file copying in Windows.
### Steps for Windows Users
1. **Create a Node.js Script File**:
- In the `examples\simple-chat-ts` directory, create a file named `copy-config.js`.
- Add the following code to handle file copying:
```javascript
const fs = require("fs");
// Copy file
fs.copyFileSync("src/gh-config.js", "src/app-config.js");
```
2. **Modify `package.json`**:
- In the `scripts` section of your `package.json`, replace Unix-style `cp` commands with our new Node.js script. For example:
```json
"scripts": {
"start": "node copy-config.js && parcel src/llm_chat.html --port 8888",
"mlc-local": "node copy-config.js && parcel src/llm_chat.html --port 8888",
"build": "node copy-config.js && parcel build src/llm_chat.html --dist-dir lib --no-content-hash"
},
```
3. **Run the Application**:
- Save your changes and run `npm start` in CMD or PowerShell to start the application.
================================================
FILE: examples/simple-chat-ts/package.json
================================================
{
"name": "simple-chat",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "cp src/gh-config.js src/app-config.js && parcel src/llm_chat.html --port 8883",
"build": "cp src/gh-config.js src/app-config.js && parcel build src/llm_chat.html --dist-dir lib --no-content-hash"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/simple-chat-ts/src/gh-config.js
================================================
import { prebuiltAppConfig } from "@mlc-ai/web-llm";

// App configuration: expose every prebuilt model and run inference in a web
// worker so the UI thread stays responsive.
export default {
  model_list: prebuiltAppConfig.model_list,
  use_web_worker: true,
};
================================================
FILE: examples/simple-chat-ts/src/llm_chat.css
================================================
.chatui {
display: flex;
position: relative;
flex-flow: column wrap;
justify-content: space-between;
width: 100%;
max-width: 867px;
margin: 25px 10px;
height: 600px;
border: 2px solid #ddd;
border-radius: 5px;
background-color: #1f2027;
}
.chatui-select-wrapper {
display: flex;
justify-content: center;
background-color: #1f2027;
padding: 10px 0;
}
#chatui-select {
width: 350px;
background-color: #1f2027;
color: white;
border: none;
}
#chatui-select:focus {
outline: none;
}
#chatui-select::-webkit-scrollbar {
display: none;
}
#chatui-select option {
background-color: #1f2027;
color: white;
}
#chatui-select option:hover {
background-color: #474747;
color: white;
}
/* Fixed: a stray "s " descendant combinator made this rule apply only to
   .chatui-header elements inside <s> tags, effectively disabling it. */
.chatui-header {
  display: flex;
  justify-content: space-between;
  padding: 10px;
  border-bottom: 2px solid #ddd;
  background: #eee;
  color: #666;
}
/* Used to remove tiny white lines in android devices; not sure if there is a better way */
*,
*::before,
*::after {
box-sizing: content-box;
}
.chatui-chat {
flex: 1;
overflow-y: auto;
padding: 10px;
background-color: #1f2027;
}
.chatui-chat::-webkit-scrollbar {
width: 6px;
}
.chatui-chat::-webkit-scrollbar-track {
background: #1f2027;
}
.chatui-chat::-webkit-scrollbar-thumb {
background: #888;
}
.chatui-chat::-webkit-scrollbar-thumb:hover {
background: #555;
}
.msg {
display: flex;
align-items: flex-end;
margin-bottom: 10px;
}
.msg:last-of-type {
margin: 0;
}
.msg-bubble {
background-color: #f0f0f0;
border-radius: 8px;
padding: 16px;
margin: 5px auto;
width: calc(100% - 20px);
box-sizing: border-box;
color: black;
border: none;
font-size: medium;
margin-left: auto;
margin-right: auto;
}
.left-msg .msg-bubble {
background-color: #343541;
color: #ececec;
}
.error-msg .msg-bubble {
background-color: #343541;
color: #f15959;
}
.init-msg .msg-bubble {
background-color: #343541;
color: #ececec;
}
.right-msg .msg-bubble {
background-color: #444654;
color: #ececec;
}
.chatui-inputarea {
display: flex;
padding: 10px;
border-top: 2px solid transparent;
background-color: #1f2027;
}
.chatui-inputarea * {
padding: 10px;
border: none;
border-radius: 3px;
font-size: 1em;
color: white;
background: rgba(0, 0, 0, 0.3);
}
.chatui-input {
flex: 1;
background-color: #40414f;
color: white;
}
.chatui-reset-btn {
  margin-left: 10px;
  background-color: #40414f;
  color: #fff;
  font-weight: bold;
  cursor: pointer;
  background-image: url("img/reset.png");
  /* de-duplicated: background-repeat/background-position were declared twice */
  background-repeat: no-repeat;
  background-position: center;
  background-size: 20px 20px;
  width: 40px;
}
.chatui-reset-btn:hover {
background-color: #03a33e;
}
.chatui-send-btn {
  margin-left: 10px;
  background-color: #40414f;
  color: #fff;
  font-weight: bold;
  cursor: pointer;
  background-image: url("img/plane.png");
  /* de-duplicated: background-repeat/background-position were declared twice */
  background-repeat: no-repeat;
  background-position: center;
  background-size: 20px 20px;
  width: 40px;
}
.chatui-send-btn:hover {
background-color: #03a33e;
}
================================================
FILE: examples/simple-chat-ts/src/llm_chat.html
================================================
================================================
FILE: examples/simple-chat-ts/src/simple_chat.ts
================================================
import appConfig from "./app-config";
import * as webllm from "@mlc-ai/web-llm";
// Look up a DOM element by id, throwing instead of returning null so callers
// never have to handle a missing element.
function getElementAndCheck(id: string): HTMLElement {
  const el = document.getElementById(id);
  if (!el) {
    throw Error("Cannot find element " + id);
  }
  return el;
}
class ChatUI {
private uiChat: HTMLElement;
private uiChatInput: HTMLInputElement;
private uiChatInfoLabel: HTMLLabelElement;
private engine: webllm.MLCEngineInterface | webllm.WebWorkerMLCEngine;
private config: webllm.AppConfig = appConfig;
private selectedModel: string;
private chatLoaded = false;
private requestInProgress = false;
private chatHistory: webllm.ChatCompletionMessageParam[] = [];
// We use a request chain to ensure that
// all requests send to chat are sequentialized
private chatRequestChain: Promise = Promise.resolve();
/**
 * An asynchronous factory constructor since we need to await getMaxStorageBufferBindingSize();
 * this is not allowed in a constructor (which cannot be asynchronous).
 *
 * Wires the chat UI to `engine`: binds DOM elements and event handlers,
 * probes device capability, and populates the model selector (restricting
 * heavy models on low-resource devices).
 */
public static CreateAsync = async (engine: webllm.MLCEngineInterface) => {
  const chatUI = new ChatUI();
  chatUI.engine = engine;
  // get the elements
  chatUI.uiChat = getElementAndCheck("chatui-chat");
  chatUI.uiChatInput = getElementAndCheck("chatui-input") as HTMLInputElement;
  chatUI.uiChatInfoLabel = getElementAndCheck(
    "chatui-info-label",
  ) as HTMLLabelElement;
  // register event handlers
  getElementAndCheck("chatui-reset-btn").onclick = () => {
    chatUI.onReset();
  };
  getElementAndCheck("chatui-send-btn").onclick = () => {
    chatUI.onGenerate();
  };
  // TODO: find other alternative triggers
  // keyCode 13 === Enter key
  getElementAndCheck("chatui-input").onkeypress = (event) => {
    if (event.keyCode === 13) {
      chatUI.onGenerate();
    }
  };
  // When we detect low maxStorageBufferBindingSize, we assume that the device (e.g. an Android
  // phone) can only handle small models and make all other models unselectable. Otherwise, the
  // browser may crash. See https://github.com/mlc-ai/web-llm/issues/209.
  // Also use GPU vendor to decide whether it is a mobile device (hence with limited resources).
  const androidMaxStorageBufferBindingSize = 1 << 27; // 128MB
  const mobileVendors = new Set(["qualcomm", "arm"]);
  let restrictModels = false;
  let maxStorageBufferBindingSize: number;
  let gpuVendor: string;
  try {
    // Query both device properties in parallel.
    [maxStorageBufferBindingSize, gpuVendor] = await Promise.all([
      engine.getMaxStorageBufferBindingSize(),
      engine.getGPUVendor(),
    ]);
  } catch (err) {
    // Without device info we cannot safely populate the selector; bail out.
    chatUI.appendMessage("error", "Init error, " + err.toString());
    console.log(err.stack);
    return;
  }
  if (
    (gpuVendor.length != 0 && mobileVendors.has(gpuVendor)) ||
    maxStorageBufferBindingSize <= androidMaxStorageBufferBindingSize
  ) {
    chatUI.appendMessage(
      "init",
      "Your device seems to have " +
        "limited resources, so we restrict the selectable models.",
    );
    restrictModels = true;
  }
  // Populate modelSelector
  const modelSelector = getElementAndCheck(
    "chatui-select",
  ) as HTMLSelectElement;
  for (let i = 0; i < chatUI.config.model_list.length; ++i) {
    const item = chatUI.config.model_list[i];
    const opt = document.createElement("option");
    opt.value = item.model_id;
    opt.innerHTML = item.model_id;
    opt.selected = i == 0;
    if (
      (restrictModels &&
        (item.low_resource_required === undefined ||
          !item.low_resource_required)) ||
      (item.buffer_size_required_bytes &&
        maxStorageBufferBindingSize < item.buffer_size_required_bytes)
    ) {
      // Either on a low-resource device and not a low-resource model
      // Or device's maxStorageBufferBindingSize does not satisfy the model's need (if specified)
      // The ?bypassRestrictions URL parameter re-enables disabled options.
      const params = new URLSearchParams(location.search);
      opt.disabled = !params.has("bypassRestrictions");
      opt.selected = false;
    }
    // Insert a visual separator whenever the model family (prefix before the
    // first "-") changes from the previous option.
    if (
      !modelSelector.lastChild?.textContent?.startsWith(
        opt.value.split("-")[0],
      )
    ) {
      modelSelector.appendChild(document.createElement("hr"));
    }
    modelSelector.appendChild(opt);
  }
  modelSelector.appendChild(document.createElement("hr"));
  chatUI.selectedModel = modelSelector.value;
  modelSelector.onchange = () => {
    chatUI.onSelectChange(modelSelector);
  };
  return chatUI;
};
/**
 * Push a task to the execution queue.
 *
 * Tasks are chained onto `chatRequestChain` so all chat requests run
 * strictly one after another.
 *
 * @param task The task to be executed;
 */
private pushTask(task: () => Promise<void>) {
  // Fixed: the `<void>` type argument was lost (bare `Promise` is invalid
  // TypeScript — the generic requires a type argument).
  const lastEvent = this.chatRequestChain;
  this.chatRequestChain = lastEvent.then(task);
}
// Event handlers
// Every handler enqueues its work onto the sequential task chain; in-flight
// generations can be interrupted via engine.interruptGenerate().
private async onGenerate() {
  // Drop the request when a generation is already running.
  if (this.requestInProgress) {
    return;
  }
  this.pushTask(() => this.asyncGenerate());
}
// Switch the active model: interrupt any running generation, then (on the
// task chain) reset state, unload the old model, and load the new one.
private async onSelectChange(modelSelector: HTMLSelectElement) {
  if (this.requestInProgress) {
    this.engine.interruptGenerate();
  }
  this.pushTask(async () => {
    await this.engine.resetChat();
    this.resetChatHistory();
    await this.unloadChat();
    this.selectedModel = modelSelector.value;
    await this.asyncInitChat();
  });
}
// Reset the conversation: interrupt any running generation, then clear the
// engine's chat state and the rendered history on the task chain.
private async onReset() {
  if (this.requestInProgress) {
    this.engine.interruptGenerate();
  }
  this.pushTask(async () => {
    await this.engine.resetChat();
    this.resetChatHistory();
  });
}
// Internal helper functions
/**
 * Append a message bubble of the given kind ("left" | "right" | "init" |
 * "error") containing `text` (interpreted as HTML) to the chat pane.
 * Fixed typo "Initalize" -> "Initialize" (matches updateLastMessage).
 */
private appendMessage(kind, text) {
  if (kind == "init") {
    text = "[System Initialize] " + text;
  }
  if (this.uiChat === undefined) {
    throw Error("cannot find ui chat");
  }
  // Markup reconstructed to match the selectors the class reads back:
  // `.msg ${kind}-msg` and `.msg-text` (see updateLastMessage/resetChatHistory).
  const msg = `
    <div class="msg ${kind}-msg">
      <div class="msg-bubble">
        <div class="msg-text">${text}</div>
      </div>
    </div>
  `;
  this.uiChat.insertAdjacentHTML("beforeend", msg);
  this.uiChat.scrollTo(0, this.uiChat.scrollHeight);
}
// Special care for user input such that we treat it as pure text instead of html
/**
 * Append the user's message as a right-aligned bubble, inserting the text via
 * insertAdjacentText so raw input is never parsed as HTML.
 */
private appendUserMessage(text: string) {
  if (this.uiChat === undefined) {
    throw Error("cannot find ui chat");
  }
  // Empty msg-text; the "recurse three times" lookup below relies on this
  // exact three-level nesting. "right-msg" styles user messages (see CSS).
  const msg = `
    <div class="msg right-msg">
      <div class="msg-bubble">
        <div class="msg-text"></div>
      </div>
    </div>
  `;
  this.uiChat.insertAdjacentHTML("beforeend", msg);
  // Recurse three times to get `msg-text`
  const msgElement = this.uiChat.lastElementChild?.lastElementChild
    ?.lastElementChild as HTMLElement;
  msgElement.insertAdjacentText("beforeend", text);
  this.uiChat.scrollTo(0, this.uiChat.scrollHeight);
}
/**
 * Replace the text of the newest message bubble of `kind`, splitting the new
 * text on newlines into one <div> per line (set via textContent, so it is
 * never parsed as HTML).
 */
private updateLastMessage(kind, text) {
  if (kind == "init") {
    text = "[System Initialize] " + text;
  }
  if (this.uiChat === undefined) {
    throw Error("cannot find ui chat");
  }
  const matches = this.uiChat.getElementsByClassName(`msg ${kind}-msg`);
  if (matches.length == 0) throw Error(`${kind} message do not exist`);
  const latest = matches[matches.length - 1];
  const msgText = latest.getElementsByClassName("msg-text");
  if (msgText.length != 1) throw Error("Expect msg-text");
  const target = msgText[0];
  if (target.innerHTML == text) return; // nothing changed; skip re-render
  target.innerHTML = "";
  for (const line of text.split("\n")) {
    const row = document.createElement("div");
    row.textContent = line;
    target.append(row);
  }
  this.uiChat.scrollTo(0, this.uiChat.scrollHeight);
}
// Clear the stored history and remove every rendered bubble plus the
// runtime-stats label.
private resetChatHistory() {
  this.chatHistory = [];
  for (const tag of ["left", "right", "init", "error"]) {
    // Snapshot into an array so removal does not disturb the live collection.
    const nodes = Array.from(
      this.uiChat.getElementsByClassName(`msg ${tag}-msg`),
    );
    for (const node of nodes) {
      this.uiChat.removeChild(node);
    }
  }
  if (this.uiChatInfoLabel !== undefined) {
    this.uiChatInfoLabel.innerHTML = "";
  }
}
/**
 * Load the currently selected model into the engine, streaming progress into
 * an "init" bubble. No-op when a model is already loaded.
 */
private async asyncInitChat() {
  if (this.chatLoaded) return;
  this.requestInProgress = true;
  this.appendMessage("init", "");
  const initProgressCallback = (report) => {
    this.updateLastMessage("init", report.text);
  };
  this.engine.setInitProgressCallback(initProgressCallback);
  try {
    await this.engine.reload(this.selectedModel);
  } catch (err) {
    this.appendMessage("error", "Init error, " + err.toString());
    console.log(err.stack);
    // Fixed: await the async unload (it was a floating promise, unlike the
    // awaited call in onSelectChange) so requestInProgress is only cleared
    // after the engine has actually unloaded.
    await this.unloadChat();
    this.requestInProgress = false;
    return;
  }
  this.requestInProgress = false;
  this.chatLoaded = true;
}
// Unload the current model from the engine and mark the chat as not loaded,
// so the next asyncInitChat() performs a fresh reload.
private async unloadChat() {
await this.engine.unload();
this.chatLoaded = false;
}
/**
 * Run one round of generation: read the prompt from the input box, stream the
 * assistant reply into a "left" bubble, and record both turns in chatHistory.
 * Errors are shown in an error bubble and the engine is unloaded.
 */
private async asyncGenerate() {
  await this.asyncInitChat();
  // BUGFIX: if initialization failed, asyncInitChat already reported the
  // error and unloaded the engine; don't generate against an unloaded engine
  // (which would only surface a second, confusing error bubble).
  if (!this.chatLoaded) {
    return;
  }
  this.requestInProgress = true;
  const prompt = this.uiChatInput.value;
  if (prompt == "") {
    this.requestInProgress = false;
    return;
  }
  this.appendUserMessage(prompt);
  this.uiChatInput.value = "";
  this.uiChatInput.setAttribute("placeholder", "Generating...");
  this.appendMessage("left", "");
  this.chatHistory.push({ role: "user", content: prompt });
  try {
    let curMessage = "";
    let usage: webllm.CompletionUsage | undefined = undefined;
    const completion = await this.engine.chat.completions.create({
      stream: true,
      messages: this.chatHistory,
      stream_options: { include_usage: true },
      // if model starts with "Qwen3", disable thinking.
      extra_body: this.selectedModel.startsWith("Qwen3")
        ? {
            enable_thinking: false,
          }
        : undefined,
    });
    // TODO(Charlie): Processing of � requires changes
    for await (const chunk of completion) {
      const curDelta = chunk.choices[0]?.delta.content;
      if (curDelta) {
        curMessage += curDelta;
      }
      this.updateLastMessage("left", curMessage);
      if (chunk.usage) {
        // Only the final chunk carries usage statistics.
        usage = chunk.usage;
      }
    }
    if (usage) {
      this.uiChatInfoLabel.innerHTML =
        `prompt_tokens: ${usage.prompt_tokens}, ` +
        `completion_tokens: ${usage.completion_tokens}, ` +
        `prefill: ${usage.extra.prefill_tokens_per_s.toFixed(4)} tokens/sec, ` +
        `decoding: ${usage.extra.decode_tokens_per_s.toFixed(4)} tokens/sec`;
    }
    const finalMessage = await this.engine.getMessage();
    this.updateLastMessage("left", finalMessage); // TODO: Remove this after � issue is fixed
    this.chatHistory.push({ role: "assistant", content: finalMessage });
  } catch (err) {
    this.appendMessage("error", "Generate error, " + err.toString());
    console.log(err.stack);
    await this.unloadChat();
  }
  this.uiChatInput.setAttribute("placeholder", "Enter your message...");
  this.requestInProgress = false;
}
}
// Bootstrap: pick a worker-backed or in-page engine per app-config, then
// hand it to the UI. The model itself is loaded lazily on first generate.
const useWebWorker = appConfig.use_web_worker;
let engine: webllm.MLCEngineInterface;
// Here we do not use `CreateMLCEngine()` but instantiate an engine that is not loaded with model
if (useWebWorker) {
engine = new webllm.WebWorkerMLCEngine(
new Worker(new URL("./worker.ts", import.meta.url), { type: "module" }),
{ appConfig, logLevel: "INFO" },
);
} else {
engine = new webllm.MLCEngine({ appConfig });
}
// Fire-and-forget: CreateAsync wires up all DOM handlers.
ChatUI.CreateAsync(engine);
================================================
FILE: examples/simple-chat-ts/src/worker.ts
================================================
// Serve the engine workload through web worker
import { WebWorkerMLCEngineHandler } from "@mlc-ai/web-llm";

const handler = new WebWorkerMLCEngineHandler();
// Forward every message from the main thread straight to the handler.
self.onmessage = (msg: MessageEvent) => handler.onmessage(msg);
================================================
FILE: examples/simple-chat-upload/README.md
================================================
# SimpleChat
This folder provides a complete implementation of a simple
chat app based on WebLLM. To try it out, you can do the following steps
under this folder
```bash
npm install
npm start
```
Note: if you would like to hack on the WebLLM core package, you can change the
`web-llm` dependency to `"file:../.."` and follow the build-from-source
instructions in the project to build WebLLM locally. This option is only
recommended if you intend to modify the WebLLM core package.
Due to the differences in command-line tools between Unix/Linux and Windows systems, special adaptation is necessary for Windows. Unix/Linux systems natively support commands like `cp` for file operations, which are not directly available in Windows. To ensure cross-platform compatibility, we use a Node.js script for file copying in Windows.
### Steps for Windows Users
1. **Create a Node.js Script File**:
- In the `examples\simple-chat-upload` directory, create a file named `copy-config.js`.
- Add the following code to handle file copying:
```javascript
const fs = require("fs");
// Copy file
fs.copyFileSync("src/gh-config.js", "src/app-config.js");
```
2. **Modify `package.json`**:
- In the `scripts` section of your `package.json`, replace Unix-style `cp` commands with our new Node.js script. For example:
```json
"scripts": {
"start": "node copy-config.js && parcel src/llm_chat.html --port 8888",
"mlc-local": "node copy-config.js && parcel src/llm_chat.html --port 8888",
"build": "node copy-config.js && parcel build src/llm_chat.html --dist-dir lib --no-content-hash"
},
```
3. **Run the Application**:
- Save your changes and run `npm start` in CMD or PowerShell to start the application.
================================================
FILE: examples/simple-chat-upload/package.json
================================================
{
"name": "simple-chat",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "cp src/gh-config.js src/app-config.js && parcel src/llm_chat.html --port 8883",
"build": "cp src/gh-config.js src/app-config.js && parcel build src/llm_chat.html --dist-dir lib --no-content-hash"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/simple-chat-upload/src/gh-config.js
================================================
// App config consumed by simple_chat.ts: expose every prebuilt model and
// serve the engine from a web worker.
import { prebuiltAppConfig } from "@mlc-ai/web-llm";
export default {
model_list: prebuiltAppConfig.model_list,
use_web_worker: true,
};
================================================
FILE: examples/simple-chat-upload/src/llm_chat.css
================================================
.chatui {
display: flex;
position: relative;
flex-flow: column wrap;
justify-content: space-between;
width: 100%;
max-width: 867px;
margin: 25px 10px;
height: 600px;
border: 2px solid #ddd;
border-radius: 5px;
background-color: #1f2027;
}
.chatui-select-wrapper {
display: flex;
justify-content: center;
background-color: #1f2027;
padding: 10px 0;
}
#chatui-select {
width: 350px;
background-color: #1f2027;
color: white;
border: none;
}
#chatui-select:focus {
outline: none;
}
#chatui-select::-webkit-scrollbar {
display: none;
}
#chatui-select option {
background-color: #1f2027;
color: white;
}
#chatui-select option:hover {
background-color: #474747;
color: white;
}
/* Fixed: a stray leading "s" made this selector only match headers nested
   inside <s> elements, so the rule never applied. */
.chatui-header {
  display: flex;
  justify-content: space-between;
  padding: 10px;
  border-bottom: 2px solid #ddd;
  background: #eee;
  color: #666;
}
/* Used to remove tiny white lines in android devices; not sure if there is a better way */
*,
*::before,
*::after {
box-sizing: content-box;
}
.chatui-chat {
flex: 1;
overflow-y: auto;
padding: 10px;
background-color: #1f2027;
}
.chatui-chat::-webkit-scrollbar {
width: 6px;
}
.chatui-chat::-webkit-scrollbar-track {
background: #1f2027;
}
.chatui-chat::-webkit-scrollbar-thumb {
background: #888;
}
.chatui-chat::-webkit-scrollbar-thumb:hover {
background: #555;
}
.msg {
display: flex;
align-items: flex-end;
margin-bottom: 10px;
}
.msg:last-of-type {
margin: 0;
}
.msg-bubble {
background-color: #f0f0f0;
border-radius: 8px;
padding: 16px;
margin: 5px auto;
width: calc(100% - 20px);
box-sizing: border-box;
color: black;
border: none;
font-size: medium;
margin-left: auto;
margin-right: auto;
}
.left-msg .msg-bubble {
background-color: #343541;
color: #ececec;
}
.error-msg .msg-bubble {
background-color: #343541;
color: #f15959;
}
.init-msg .msg-bubble {
background-color: #343541;
color: #ececec;
}
.right-msg .msg-bubble {
background-color: #444654;
color: #ececec;
}
.chatui-inputarea {
display: flex;
padding: 10px;
border-top: 2px solid transparent;
background-color: #1f2027;
}
.chatui-inputarea * {
padding: 10px;
border: none;
border-radius: 3px;
font-size: 1em;
color: white;
background: rgba(0, 0, 0, 0.3);
}
.chatui-input {
flex: 1;
background-color: #40414f;
color: white;
}
/* Reset button: icon centered on the dark input-bar background.
   (Duplicate background-repeat/background-position declarations removed.) */
.chatui-reset-btn {
  margin-left: 10px;
  background-color: #40414f;
  color: #fff;
  font-weight: bold;
  cursor: pointer;
  background-image: url("img/reset.png");
  background-repeat: no-repeat;
  background-position: center;
  background-size: 20px 20px;
  width: 40px;
}
.chatui-reset-btn:hover {
background-color: #03a33e;
}
/* Send button: identical layout to the reset button, different icon.
   (Duplicate background-repeat/background-position declarations removed.) */
.chatui-send-btn {
  margin-left: 10px;
  background-color: #40414f;
  color: #fff;
  font-weight: bold;
  cursor: pointer;
  background-image: url("img/plane.png");
  background-repeat: no-repeat;
  background-position: center;
  background-size: 20px 20px;
  width: 40px;
}
.chatui-send-btn:hover {
background-color: #03a33e;
}
================================================
FILE: examples/simple-chat-upload/src/llm_chat.html
================================================
/>
================================================
FILE: examples/simple-chat-upload/src/simple_chat.ts
================================================
import appConfig from "./app-config";
import * as webllm from "@mlc-ai/web-llm";
// Look up a DOM element by id, throwing instead of returning null so
// callers can rely on a non-null HTMLElement.
function getElementAndCheck(id: string): HTMLElement {
  const found = document.getElementById(id);
  if (!found) {
    throw Error("Cannot find element " + id);
  }
  return found;
}
/**
 * Chat UI controller: binds DOM controls to a WebLLM engine, serializes all
 * engine requests through a promise chain, and renders streamed replies.
 */
class ChatUI {
  private uiChat: HTMLElement;
  private uiChatInput: HTMLInputElement;
  private uiChatInfoLabel: HTMLLabelElement;
  private engine: webllm.MLCEngineInterface | webllm.WebWorkerMLCEngine;
  private config: webllm.AppConfig = appConfig;
  private selectedModel: string;
  private chatLoaded = false;
  private requestInProgress = false;
  // Full conversation so far; resent to the engine on every request.
  private chatHistory: webllm.ChatCompletionMessageParam[] = [];
  // We use a request chain to ensure that
  // all requests sent to chat are sequentialized
  // (restored the Promise<void> generic lost in extraction).
  private chatRequestChain: Promise<void> = Promise.resolve();

  /**
   * An asynchronous factory constructor since we need to await getMaxStorageBufferBindingSize();
   * this is not allowed in a constructor (which cannot be asynchronous).
   */
  public static CreateAsync = async (engine: webllm.MLCEngineInterface) => {
    const chatUI = new ChatUI();
    chatUI.engine = engine;
    // get the elements
    chatUI.uiChat = getElementAndCheck("chatui-chat");
    chatUI.uiChatInput = getElementAndCheck("chatui-input") as HTMLInputElement;
    chatUI.uiChatInfoLabel = getElementAndCheck(
      "chatui-info-label",
    ) as HTMLLabelElement;
    // register event handlers
    getElementAndCheck("chatui-reset-btn").onclick = () => {
      chatUI.onReset();
    };
    getElementAndCheck("chatui-send-btn").onclick = () => {
      chatUI.onGenerate();
    };
    // TODO: find other alternative triggers
    getElementAndCheck("chatui-input").onkeypress = (event) => {
      if (event.keyCode === 13) {
        chatUI.onGenerate();
      }
    };
    // When we detect low maxStorageBufferBindingSize, we assume that the device (e.g. an Android
    // phone) can only handle small models and make all other models unselectable. Otherwise, the
    // browser may crash. See https://github.com/mlc-ai/web-llm/issues/209.
    // Also use GPU vendor to decide whether it is a mobile device (hence with limited resources).
    const androidMaxStorageBufferBindingSize = 1 << 27; // 128MB
    const mobileVendors = new Set(["qualcomm", "arm"]);
    let restrictModels = false;
    let maxStorageBufferBindingSize: number;
    let gpuVendor: string;
    try {
      [maxStorageBufferBindingSize, gpuVendor] = await Promise.all([
        engine.getMaxStorageBufferBindingSize(),
        engine.getGPUVendor(),
      ]);
    } catch (err) {
      chatUI.appendMessage("error", "Init error, " + err.toString());
      console.log(err.stack);
      return;
    }
    if (
      (gpuVendor.length != 0 && mobileVendors.has(gpuVendor)) ||
      maxStorageBufferBindingSize <= androidMaxStorageBufferBindingSize
    ) {
      chatUI.appendMessage(
        "init",
        "Your device seems to have " +
          "limited resources, so we restrict the selectable models.",
      );
      restrictModels = true;
    }
    // Populate modelSelector
    const modelSelector = getElementAndCheck(
      "chatui-select",
    ) as HTMLSelectElement;
    for (let i = 0; i < chatUI.config.model_list.length; ++i) {
      const item = chatUI.config.model_list[i];
      const opt = document.createElement("option");
      opt.value = item.model_id;
      opt.innerHTML = item.model_id;
      opt.selected = i == 0;
      if (
        (restrictModels &&
          (item.low_resource_required === undefined ||
            !item.low_resource_required)) ||
        (item.buffer_size_required_bytes &&
          maxStorageBufferBindingSize < item.buffer_size_required_bytes)
      ) {
        // Either on a low-resource device and not a low-resource model
        // Or device's maxStorageBufferBindingSize does not satisfy the model's need (if specified)
        const params = new URLSearchParams(location.search);
        opt.disabled = !params.has("bypassRestrictions");
        opt.selected = false;
      }
      // Insert a separator whenever the model-family prefix changes.
      if (
        !modelSelector.lastChild?.textContent?.startsWith(
          opt.value.split("-")[0],
        )
      ) {
        modelSelector.appendChild(document.createElement("hr"));
      }
      modelSelector.appendChild(opt);
    }
    modelSelector.appendChild(document.createElement("hr"));
    chatUI.selectedModel = modelSelector.value;
    modelSelector.onchange = () => {
      chatUI.onSelectChange(modelSelector);
    };
    return chatUI;
  };

  /**
   * Push a task to the execution queue.
   *
   * @param task The task to be executed;
   */
  private pushTask(task: () => Promise<void>) {
    const lastEvent = this.chatRequestChain;
    this.chatRequestChain = lastEvent.then(task);
  }

  // Event handlers
  // all event handler pushes the tasks to a queue
  // that get executed sequentially
  // the tasks previous tasks, which causes them to early stop
  // can be interrupted by engine.interruptGenerate
  private async onGenerate() {
    if (this.requestInProgress) {
      return;
    }
    this.pushTask(async () => {
      await this.asyncGenerate();
    });
  }

  private async onSelectChange(modelSelector: HTMLSelectElement) {
    if (this.requestInProgress) {
      // interrupt previous generation if any
      this.engine.interruptGenerate();
    }
    // try reset after previous requests finishes
    this.pushTask(async () => {
      await this.engine.resetChat();
      this.resetChatHistory();
      await this.unloadChat();
      this.selectedModel = modelSelector.value;
      await this.asyncInitChat();
    });
  }

  private async onReset() {
    if (this.requestInProgress) {
      // interrupt previous generation if any
      this.engine.interruptGenerate();
    }
    // try reset after previous requests finishes
    this.pushTask(async () => {
      await this.engine.resetChat();
      this.resetChatHistory();
    });
  }

  // Internal helper functions

  // Append a new message bubble of the given kind ("left"/"right"/"init"/"error").
  // CAUTION: `text` is interpolated as HTML (original behavior preserved).
  private appendMessage(kind, text) {
    if (kind == "init") {
      // Typo fix: was "[System Initalize]"; sibling example uses "Initialize".
      text = "[System Initialize] " + text;
    }
    if (this.uiChat === undefined) {
      throw Error("cannot find ui chat");
    }
    // NOTE(review): the markup below was reconstructed — the original template
    // literal was garbled in extraction. It must produce the `msg ${kind}-msg`
    // and `msg-text` selectors used by updateLastMessage()/resetChatHistory().
    // TODO confirm against upstream simple-chat.
    const msg = `
      <div class="msg ${kind}-msg">
        <div class="msg-bubble">
          <div class="msg-text">${text}</div>
        </div>
      </div>
    `;
    this.uiChat.insertAdjacentHTML("beforeend", msg);
    this.uiChat.scrollTo(0, this.uiChat.scrollHeight);
  }

  // Replace the text of the latest `kind` bubble; each line of `text`
  // becomes its own <div> via textContent (plain text, not HTML).
  private updateLastMessage(kind, text) {
    if (kind == "init") {
      // Typo fix: was "[System Initalize]".
      text = "[System Initialize] " + text;
    }
    if (this.uiChat === undefined) {
      throw Error("cannot find ui chat");
    }
    const matches = this.uiChat.getElementsByClassName(`msg ${kind}-msg`);
    if (matches.length == 0) throw Error(`${kind} message do not exist`);
    const msg = matches[matches.length - 1];
    const msgText = msg.getElementsByClassName("msg-text");
    if (msgText.length != 1) throw Error("Expect msg-text");
    if (msgText[0].innerHTML == text) return;
    const list = text.split("\n").map((t) => {
      const item = document.createElement("div");
      item.textContent = t;
      return item;
    });
    msgText[0].innerHTML = "";
    list.forEach((item) => msgText[0].append(item));
    this.uiChat.scrollTo(0, this.uiChat.scrollHeight);
  }

  // Clear the in-memory history and remove every rendered message bubble.
  private resetChatHistory() {
    this.chatHistory = [];
    const clearTags = ["left", "right", "init", "error"];
    for (const tag of clearTags) {
      // need to unpack to list so the iterator don't get affected by mutation
      const matches = [...this.uiChat.getElementsByClassName(`msg ${tag}-msg`)];
      for (const item of matches) {
        this.uiChat.removeChild(item);
      }
    }
    if (this.uiChatInfoLabel !== undefined) {
      this.uiChatInfoLabel.innerHTML = "";
    }
  }

  // Load the selected model (no-op if already loaded), streaming progress
  // into the "init" bubble.
  private async asyncInitChat() {
    if (this.chatLoaded) return;
    this.requestInProgress = true;
    this.appendMessage("init", "");
    const initProgressCallback = (report) => {
      this.updateLastMessage("init", report.text);
    };
    this.engine.setInitProgressCallback(initProgressCallback);
    try {
      await this.engine.reload(this.selectedModel);
    } catch (err) {
      this.appendMessage("error", "Init error, " + err.toString());
      console.log(err.stack);
      // BUGFIX: was a floating promise; await the teardown.
      await this.unloadChat();
      this.requestInProgress = false;
      return;
    }
    this.requestInProgress = false;
    this.chatLoaded = true;
  }

  // Unload the model and mark the chat as not loaded.
  private async unloadChat() {
    await this.engine.unload();
    this.chatLoaded = false;
  }

  /**
   * Run generate
   */
  private async asyncGenerate() {
    await this.asyncInitChat();
    // BUGFIX: if initialization failed, don't generate against an unloaded
    // engine — asyncInitChat already surfaced the error.
    if (!this.chatLoaded) {
      return;
    }
    this.requestInProgress = true;
    const prompt = this.uiChatInput.value;
    if (prompt == "") {
      this.requestInProgress = false;
      return;
    }
    this.appendMessage("right", prompt);
    this.uiChatInput.value = "";
    this.uiChatInput.setAttribute("placeholder", "Generating...");
    this.appendMessage("left", "");
    this.chatHistory.push({ role: "user", content: prompt });
    try {
      let curMessage = "";
      let usage: webllm.CompletionUsage | undefined = undefined;
      const completion = await this.engine.chat.completions.create({
        stream: true,
        messages: this.chatHistory,
        stream_options: { include_usage: true },
      });
      // TODO(Charlie): Processing of � requires changes
      for await (const chunk of completion) {
        const curDelta = chunk.choices[0]?.delta.content;
        if (curDelta) {
          curMessage += curDelta;
        }
        this.updateLastMessage("left", curMessage);
        if (chunk.usage) {
          // Only the final chunk carries usage statistics.
          usage = chunk.usage;
        }
      }
      if (usage) {
        this.uiChatInfoLabel.innerHTML =
          `prompt_tokens: ${usage.prompt_tokens}, ` +
          `completion_tokens: ${usage.completion_tokens}, ` +
          `prefill: ${usage.extra.prefill_tokens_per_s.toFixed(4)} tokens/sec, ` +
          `decoding: ${usage.extra.decode_tokens_per_s.toFixed(4)} tokens/sec`;
      }
      const finalMessage = await this.engine.getMessage();
      this.updateLastMessage("left", finalMessage); // TODO: Remove this after � issue is fixed
      this.chatHistory.push({ role: "assistant", content: finalMessage });
    } catch (err) {
      this.appendMessage("error", "Generate error, " + err.toString());
      console.log(err.stack);
      await this.unloadChat();
    }
    this.uiChatInput.setAttribute("placeholder", "Enter your message...");
    this.requestInProgress = false;
  }
}
// Bootstrap: pick a worker-backed or in-page engine per app-config, then
// hand it to the UI. The model itself is loaded lazily on first generate.
const useWebWorker = appConfig.use_web_worker;
let engine: webllm.MLCEngineInterface;
// Here we do not use `CreateMLCEngine()` but instantiate an engine that is not loaded with model
if (useWebWorker) {
engine = new webllm.WebWorkerMLCEngine(
new Worker(new URL("./worker.ts", import.meta.url), { type: "module" }),
{ appConfig },
);
} else {
engine = new webllm.MLCEngine({ appConfig });
}
// Fire-and-forget: CreateAsync wires up all DOM handlers.
ChatUI.CreateAsync(engine);
// Classify an uploaded file by name into the cache namespace it belongs to:
// wasm libraries, model weights/metadata, chat config, or a generic fallback.
function getFileType(file: File) {
  const name = file.name;
  if (name.includes("wasm")) {
    return "webllm/wasm";
  }
  if (name.includes(".bin") || name.includes("ndarray-cache.json")) {
    return "webllm/model";
  }
  if (name.includes("mlc-chat-config.json")) {
    return "webllm/config";
  }
  console.log("No model file suffix found");
  return "file-cache";
}
/**
 * Store one uploaded file into IndexedDB under the database named by its
 * file type, keyed by file name.
 *
 * BUGFIXES vs. original:
 * - The transaction was opened synchronously, before `onsuccess` had assigned
 *   `db`, so it always ran on `undefined`; all DB work now happens inside
 *   `onsuccess`.
 * - The store was created as "urls" (with a keyPath) but the transaction
 *   opened "files"; both now use an out-of-line-key store named "files",
 *   since the value written is raw file content, not an object with a `url`.
 * - The FileReader's read was never started; `readAsArrayBuffer` is now called.
 */
async function uploadToIndexedDB(file: File) {
  const request = indexedDB.open(getFileType(file), 1);
  request.onupgradeneeded = (event) => {
    const db = (event.target as IDBOpenDBRequest).result;
    if (!db.objectStoreNames.contains("files")) {
      // Out-of-line keys: we pass the file name explicitly on add().
      db.createObjectStore("files");
    }
  };
  request.onerror = (event) => {
    console.error("Database error: ", (event.target as IDBOpenDBRequest).error);
  };
  request.onsuccess = (event) => {
    const db = (event.target as IDBOpenDBRequest).result;
    const reader = new FileReader();
    reader.onload = (e) => {
      if (e.target === null || e.target.result === null) {
        console.error("Do not read any files");
        return;
      }
      const transaction = db.transaction("files", "readwrite");
      const store = transaction.objectStore("files");
      store.add(e.target.result, file.name);
      transaction.oncomplete = function () {
        alert("All files have been uploaded to IndexedDB.");
      };
      transaction.onerror = function (ev) {
        console.error("Error uploading files:", ev);
      };
    };
    reader.readAsArrayBuffer(file);
  };
}
// Store `response` in the Cache Storage bucket that matches the file's type,
// keyed by the file name. Failures are logged, not rethrown (best effort).
async function cacheFile(file: File, response: Response) {
  const cacheName = getFileType(file);
  try {
    const cache = await caches.open(cacheName);
    console.log("Put response into cache:", response);
    await cache.put(file.name, response);
  } catch (err) {
    console.error("Failed to cache the file:", err);
  }
}
/**
 * Handler for the upload button: read every selected file and store it in
 * IndexedDB or the Cache API depending on app config.
 * (Restored the Promise<void> return generic lost in extraction, and awaits
 * uploadToIndexedDB instead of firing it off unawaited.)
 */
async function uploadFiles(): Promise<void> {
  const input = document.getElementById("file-input") as HTMLInputElement;
  if (!input.files || input.files.length === 0) {
    alert("No files selected.");
    return;
  }
  if (appConfig.useIndexedDBCache) {
    for (const file of input.files) {
      await uploadToIndexedDB(file);
    }
  } else {
    for (const file of input.files) {
      const reader = new FileReader();
      reader.onload = async (e) => {
        if (e.target === null || e.target.result === null) {
          console.error("Do not read any files");
          return;
        }
        const arrayBuffer = e.target.result as ArrayBuffer;
        const response = new Response(arrayBuffer, {
          status: 200,
          statusText: "OK",
          headers: {
            "Content-Type": "application/octet-stream",
            "Content-Length": arrayBuffer.byteLength.toString(),
          },
        });
        await cacheFile(file, response);
      };
      // JSON metadata is read as text; everything else as raw bytes.
      if (
        file.name.includes("mlc-chat-config.json") ||
        file.name.includes("ndarray-cache.json")
      ) {
        reader.readAsText(file);
      } else {
        reader.readAsArrayBuffer(file);
      }
    }
  }
}
// Expose for the inline onclick handler in llm_chat.html.
(window as any).uploadFiles = uploadFiles;
================================================
FILE: examples/simple-chat-upload/src/worker.ts
================================================
// Serve the engine workload through web worker
import { WebWorkerMLCEngineHandler } from "@mlc-ai/web-llm";

const handler = new WebWorkerMLCEngineHandler();
// Forward every message from the main thread straight to the handler.
self.onmessage = (msg: MessageEvent) => handler.onmessage(msg);
================================================
FILE: examples/streaming/README.md
================================================
### OpenAI API Demos
Run `npm install` first, followed by `npm start`.
Note: if you would like to hack on the WebLLM core package, you can change the
`web-llm` dependency to `"file:../.."` and follow the build-from-source
instructions in the project to build WebLLM locally. This option is only
recommended if you intend to modify the WebLLM core package.
================================================
FILE: examples/streaming/package.json
================================================
{
"name": "streaming",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/streaming.html --port 8888",
"build": "parcel build src/streaming.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/streaming/src/streaming.html
================================================
WebLLM Test Page
Open console to see output
Response
================================================
FILE: examples/streaming/src/streaming.ts
================================================
import * as webllm from "@mlc-ai/web-llm";
// Write `text` into the element with the given id, throwing if it is absent.
function setLabel(id: string, text: string) {
  const target = document.getElementById(id);
  if (target == null) {
    throw Error("Cannot find label " + id);
  }
  target.innerText = text;
}
/**
 * We demonstrate chat completion with streaming, where delta is sent while generating response.
 */
async function main() {
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    "Llama-3.1-8B-Instruct-q4f32_1-MLC",
    {
      initProgressCallback: (report: webllm.InitProgressReport) => {
        setLabel("init-label", report.text);
      },
    },
  );

  const request: webllm.ChatCompletionRequest = {
    stream: true,
    stream_options: { include_usage: true },
    messages: [
      {
        role: "system",
        content:
          "You are a pirate chatbot who always responds in pirate speak!",
      },
      { role: "user", content: "Who are you?" },
    ],
    logprobs: true,
    top_logprobs: 2,
  };

  const chunks = await engine.chat.completions.create(request);
  let message = "";
  for await (const chunk of chunks) {
    console.log(chunk);
    message += chunk.choices[0]?.delta?.content || "";
    setLabel("generate-label", message);
    if (chunk.usage) {
      console.log(chunk.usage); // only last chunk has usage
    }
    // engine.interruptGenerate(); // works with interrupt as well
  }
  console.log("Final message:\n", await engine.getMessage()); // the concatenated message
}
main();
================================================
FILE: examples/text-completion/README.md
================================================
# WebLLM Get Started App
This folder provides a minimum demo to show WebLLM API in a webapp setting.
To try it out, you can do the following steps under this folder
```bash
npm install
npm start
```
Note: if you would like to hack on the WebLLM core package, you can change the
`web-llm` dependency to `"file:../.."` and follow the build-from-source
instructions in the project to build WebLLM locally. This option is only
recommended if you intend to modify the WebLLM core package.
================================================
FILE: examples/text-completion/package.json
================================================
{
"name": "text-completion",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/text_completion.html --port 8888",
"build": "parcel build src/text_completion.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/text-completion/src/text_completion.html
================================================
WebLLM Test Page
Open console to see output
Prompt
Response
================================================
FILE: examples/text-completion/src/text_completion.ts
================================================
import * as webllm from "@mlc-ai/web-llm";

// Write `text` into the element with the given id, throwing if it is absent.
function setLabel(id: string, text: string) {
  const target = document.getElementById(id);
  if (target == null) {
    throw Error("Cannot find label " + id);
  }
  target.innerText = text;
}
// Demonstrates the raw text-completion API (no chat template) against a
// base (non-instruct) Llama model.
async function main() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  // Unlike "Llama-3.1-8B-Instruct-q4f32_1-MLC", this is a base model
  const selectedModel = "Llama-3.1-8B-q4f32_1-MLC";
  const appConfig: webllm.AppConfig = {
    model_list: [
      {
        model: "https://huggingface.co/mlc-ai/Llama-3.1-8B-q4f32_1-MLC", // a base model
        model_id: selectedModel,
        model_lib:
          webllm.modelLibURLPrefix +
          webllm.modelVersion +
          "/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm",
        overrides: {
          context_window_size: 2048,
        },
      },
    ],
  };

  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    {
      appConfig: appConfig,
      initProgressCallback: initProgressCallback,
      logLevel: "INFO",
    },
  );

  const reply = await engine.completions.create({
    prompt: "List 3 US states: ",
    // below configurations are all optional
    echo: true,
    n: 2,
    max_tokens: 64,
    logprobs: true,
    top_logprobs: 2,
  });
  console.log(reply);
  console.log(reply.usage);

  // To change model, either create a new engine via `CreateMLCEngine()`, or call `engine.reload(modelId)`
}
main();
================================================
FILE: examples/vision-model/README.md
================================================
# WebLLM Get Started App
This folder provides a minimum demo to show WebLLM API in a webapp setting.
To try it out, you can do the following steps under this folder
```bash
npm install
npm start
```
Note if you would like to hack WebLLM core package.
You can change web-llm dependencies as `"file:../.."`, and follow the build from source
instruction in the project to build webllm locally. This option is only recommended
if you would like to hack WebLLM core package.
================================================
FILE: examples/vision-model/package.json
================================================
{
"name": "get-started",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/vision_model.html --port 8888",
"build": "parcel build src/vision_model.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.80"
}
}
================================================
FILE: examples/vision-model/src/utils.ts
================================================
/**
 * Download the image at `url` into an offscreen canvas and resolve with its
 * raw pixel data. Rejects if the image fails to load.
 * (Restored the Promise<ImageData> generic lost in extraction — the body
 * resolves the ImageData returned by getImageData.)
 */
export function getImageDataFromURL(url: string): Promise<ImageData> {
  return new Promise((resolve, reject) => {
    // Converts img to any, and later `as CanvasImageSource`, otherwise build complains
    const img: any = new Image();
    img.crossOrigin = "anonymous"; // Important for CORS
    img.onload = () => {
      const canvas: HTMLCanvasElement = document.createElement("canvas");
      const ctx: CanvasRenderingContext2D = canvas.getContext("2d")!;
      canvas.width = img.width;
      canvas.height = img.height;
      ctx.drawImage(img as CanvasImageSource, 0, 0);
      const imageData = ctx.getImageData(0, 0, img.width, img.height);
      resolve(imageData);
    };
    img.onerror = () => reject(new Error("Failed to load image"));
    img.src = url; // assigning src starts the load
  });
}
/**
 * Fetch the image at `url` and return it re-encoded as a base64 data URL.
 * (Restored the Promise<string> generic lost in extraction —
 * canvas.toDataURL() returns a string.)
 */
export async function imageURLToBase64(url: string): Promise<string> {
  const imageData: ImageData = await getImageDataFromURL(url);
  const canvas = document.createElement("canvas");
  const ctx = canvas.getContext("2d");
  canvas.width = imageData.width;
  canvas.height = imageData.height;
  ctx!.putImageData(imageData, 0, 0);
  return canvas.toDataURL();
}
================================================
FILE: examples/vision-model/src/vision_model.html
================================================