@@ -467,6 +467,7 @@ struct llama_server_context
     bool all_slots_are_idle = false;
     bool add_bos_token = true;
     bool has_eos_token = true;
+    bool has_gpu = false;
 
     bool grammar_lazy = false;
     std::vector<common_grammar_trigger> grammar_triggers;
@@ -511,7 +512,10 @@ struct llama_server_context
         if (!params.mmproj.empty()) {
             multimodal = true;
             LOG_INFO("Multi Modal Mode Enabled", {});
-            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
+            clp_ctx = clip_init(params.mmproj.c_str(), clip_context_params {
+                /* use_gpu */   has_gpu,
+                /* verbosity=*/ 1,
+            });
             if (clp_ctx == nullptr) {
                 LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
                 return false;
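
Reviewer note: this call-site change tracks an upstream llama.cpp API change in which clip_model_load() was replaced by clip_init() taking a clip_context_params struct. A minimal sketch of the assumed upstream declaration follows (verify against the vendored examples/llava/clip.h; in newer trees verbosity is an enum rather than a plain int):

    // Sketch of the assumed upstream clip API -- not the canonical header.
    struct clip_context_params {
        bool use_gpu;    // whether to offload the clip/mmproj model to the GPU
        int  verbosity;  // log verbosity; may be ggml_log_level upstream
    };

    struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);

Because the hunk above initializes the struct positionally (the /* use_gpu */ and /* verbosity= */ comments are annotations only), the field order must match the real header; a swapped order could still compile if the types happen to convert.
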
@@ -2314,7 +2318,7 @@ static std::string get_all_kv_cache_types() {
 }
 
 static void params_parse(const backend::ModelOptions* request,
-                         common_params & params) {
+                         common_params & params, llama_server_context &llama) {
 
     // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
 
@@ -2352,6 +2356,20 @@ static void params_parse(const backend::ModelOptions* request,
         add_rpc_devices(std::string(llama_grpc_servers));
     }
 
+    // Decode options. Options are in the form optname:optval, or just optname for booleans.
+    for (int i = 0; i < request->options_size(); i++) {
+        std::string opt = request->options(i);
+        char *optname = strtok(&opt[0], ":");
+        const char *optval = strtok(NULL, ":");
+        if (optval == NULL) {
+            optval = "true";
+        }
+
+        if (!strcmp(optname, "gpu")) {
+            llama.has_gpu = true;
+        }
+    }
+
     // TODO: Add yarn
 
     if (!request->tensorsplit().empty()) {
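
The new loop splits each entry of the request's repeated options field on the first ':'; a bare name with no value is treated as boolean true. Note that for gpu the parsed value is currently ignored, so gpu:false would still set llama.has_gpu. A self-contained sketch of the same splitting semantics, with hypothetical sample inputs:

    // Standalone illustration of the optname:optval split used above.
    // strtok() mutates its input, which is why the server first copies each
    // option into a local std::string and tokenizes via &opt[0].
    #include <cstdio>
    #include <cstring>
    #include <string>

    int main() {
        std::string samples[] = { "gpu", "context_size:4096", "f16" };  // hypothetical inputs
        for (std::string &opt : samples) {
            char *optname = strtok(&opt[0], ":");
            const char *optval = strtok(nullptr, ":");
            if (optval == nullptr) {
                optval = "true";  // bare option name => boolean true
            }
            std::printf("%s = %s\n", optname, optval);
        }
        return 0;
    }

This prints gpu = true, context_size = 4096, f16 = true. (The samples other than gpu are made-up names; only gpu is interpreted by this diff.)
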
@@ -2445,7 +2463,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
     grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
         // Implement LoadModel RPC
         common_params params;
-        params_parse(request, params);
+        params_parse(request, params, llama);
 
         llama_backend_init();
         llama_numa_init(params.numa);
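
Taken together: LoadModel now passes the shared llama_server_context into params_parse, params_parse flips llama.has_gpu when a gpu option is present in the request, and the server context's model-loading code forwards that flag to clip_init. The flag only affects the clip/mmproj model; GPU offload of the main model is untouched by this diff.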