Commit 423514a

Browse files
fix(clip): do not imply GPU offload by default (#5010)

* fix(clip): do not imply GPUs by default

  Until a better solution is found upstream, be conservative and default to CPU (no GPU offload for CLIP).

  ggml-org/llama.cpp#12322
  ggml-org/llama.cpp#12322 (comment)

  Signed-off-by: Ettore Di Giacinto <[email protected]>

* allow to override gpu via backend options

  Signed-off-by: Ettore Di Giacinto <[email protected]>

---------

Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent 12568c7 commit 423514a

File tree

1 file changed: +21 -3 lines changed

backend/cpp/llama/grpc-server.cpp

+21 -3
@@ -467,6 +467,7 @@ struct llama_server_context
     bool all_slots_are_idle = false;
     bool add_bos_token = true;
     bool has_eos_token = true;
+    bool has_gpu = false;
 
     bool grammar_lazy = false;
     std::vector<common_grammar_trigger> grammar_triggers;
@@ -511,7 +512,10 @@ struct llama_server_context
         if (!params.mmproj.empty()) {
             multimodal = true;
             LOG_INFO("Multi Modal Mode Enabled", {});
-            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
+            clp_ctx = clip_init(params.mmproj.c_str(), clip_context_params {
+                /* use_gpu */ has_gpu,
+                /*verbosity=*/ 1,
+            });
             if(clp_ctx == nullptr) {
                 LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
                 return false;
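
This hunk moves from clip_model_load() to upstream llama.cpp's clip_init(), which takes a clip_context_params carrying the use_gpu flag, so CLIP offload now follows has_gpu instead of being implied. Below is a minimal sketch of that call in isolation, assuming only the signature visible above; the load_mmproj helper name is hypothetical and not part of the commit.

// Sketch only: mirrors the clip_init() call above; load_mmproj is hypothetical.
#include "clip.h"   // llama.cpp's CLIP loader header (path may differ by tree layout)

struct clip_ctx * load_mmproj(const char * mmproj_path, bool has_gpu) {
    struct clip_context_params cparams {
        /* use_gpu   */ has_gpu,   // stays false unless the "gpu" backend option is set
        /* verbosity */ 1,
    };
    struct clip_ctx * ctx = clip_init(mmproj_path, cparams);
    if (ctx == nullptr) {
        // the server logs "unable to load clip model" and aborts model loading
        return nullptr;
    }
    return ctx;
}

With has_gpu defaulting to false, the mmproj/CLIP weights stay on the CPU unless a backend option opts back in (see the option decoding below).
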
@@ -2314,7 +2318,7 @@ static std::string get_all_kv_cache_types() {
 }
 
 static void params_parse(const backend::ModelOptions* request,
-                         common_params & params) {
+                         common_params & params, llama_server_context &llama) {
 
     // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
 
@@ -2352,6 +2356,20 @@ static void params_parse(const backend::ModelOptions* request,
         add_rpc_devices(std::string(llama_grpc_servers));
     }
 
+    // decode options. Options are in form optname:optvale, or if booleans only optname.
+    for (int i = 0; i < request->options_size(); i++) {
+        std::string opt = request->options(i);
+        char *optname = strtok(&opt[0], ":");
+        char *optval = strtok(NULL, ":");
+        if (optval == NULL) {
+            optval = "true";
+        }
+
+        if (!strcmp(optname, "gpu")) {
+            llama.has_gpu = true;
+        }
+    }
+
     // TODO: Add yarn
 
     if (!request->tensorsplit().empty()) {
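
The loop added above splits each backend option on ':' and treats a bare name as a boolean; only the "gpu" name is acted on so far. Here is a small standalone sketch of that decoding behavior, compilable on its own; the "f16:true" and "other:value" entries are made-up examples, not options defined by the backend.

// Standalone sketch of the optname:optvalue decoding used above; not part of the commit.
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

int main() {
    bool has_gpu = false;  // mirrors llama_server_context::has_gpu defaulting to false

    std::vector<std::string> options = { "gpu", "f16:true", "other:value" };
    for (std::string opt : options) {                // copy: strtok mutates its input
        char *optname = strtok(&opt[0], ":");
        char *optval  = strtok(NULL, ":");
        const char *val = optval ? optval : "true";  // bare option name -> boolean "true"

        if (!strcmp(optname, "gpu")) {
            has_gpu = true;                          // presence of "gpu" re-enables CLIP offload
        }
        printf("%s = %s\n", optname, val);
    }
    printf("has_gpu = %s\n", has_gpu ? "true" : "false");
    return 0;
}

Note that only the option name is checked for "gpu", so "gpu", "gpu:true" and even "gpu:false" all enable offload; the parsed value is ignored for now.
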
@@ -2445,7 +2463,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
     grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
         // Implement LoadModel RPC
         common_params params;
-        params_parse(request, params);
+        params_parse(request, params, llama);
 
         llama_backend_init();
         llama_numa_init(params.numa);
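
End to end, a LoadModel request that wants CLIP GPU offload back opts in through the options field. The sketch below is illustrative only: it assumes the protobuf-generated add_options() setter for the repeated options field implied by request->options_size() above, and the example function name is hypothetical.

// Sketch only, not part of the commit: wiring a request's "gpu" option to CLIP offload.
static void example_request_with_gpu(llama_server_context &llama) {
    backend::ModelOptions request;
    // ... model/mmproj and other fields would be set as usual (omitted) ...
    request.add_options("gpu");           // bare boolean option: presence enables it
    // request.add_options("key:value");  // general form for future non-boolean options

    common_params params;
    params_parse(&request, params, llama);  // sets llama.has_gpu = true
    // subsequent model loading passes has_gpu to clip_init() as use_gpu
}
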
