@@ -2369,13 +2369,34 @@ struct llama_context {
     struct llama_control_vector cvec;
 };
 
+static size_t llama_get_device_count(const llama_model & model) {
+    size_t count = 1;
+#if defined(GGML_USE_CUDA)
+    count = ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    count = ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    count = ggml_backend_vk_get_device_count();
+#endif
+#if defined(GGML_USE_RPC)
+    count += model.rpc_servers.size();
+#endif
+    return count;
+    GGML_UNUSED(model);
+}
+
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;
 
-#ifdef GGML_USE_RPC
-    std::string endpoint = model.rpc_servers[gpu];
-    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (gpu >= dev_count - rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+        return ggml_backend_rpc_buffer_type(endpoint);
+    }
+#endif
+#if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
     buft = ggml_backend_cuda_buffer_type(gpu);
@@ -2423,29 +2444,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
     GGML_UNUSED(tensor_split);
 }
 
-static size_t llama_get_device_count(const llama_model & model) {
-#if defined(GGML_USE_RPC)
-    return model.rpc_servers.size();
-#elif defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-    GGML_UNUSED(model);
-}
-
 static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(GGML_USE_RPC)
-    size_t total;
-    size_t free;
-    std::string endpoint = model.rpc_servers[device];
-    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
-    return free;
-#elif defined(GGML_USE_CUDA)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (device >= dev_count - rpc_count) {
+        size_t total;
+        size_t free;
+        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+        ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
+        return free;
+    }
+#endif
+#if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &free, &total);
@@ -16146,7 +16157,7 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-    if (params.rpc_servers != nullptr) {
+    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
         // split the servers set them into model->rpc_servers
         std::string servers(params.rpc_servers);
         size_t pos = 0;
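
(The hunk above cuts off before the body of the parsing loop. As an illustration only, and not code from this commit, here is a standalone sketch of splitting a comma-separated endpoint list the way llama.cpp's --rpc option documents it, including the early-out that the new params.rpc_servers[0] != '\0' guard provides; the helper name and the example addresses are made up.)

// Standalone sketch (assumed, not from this commit): split a comma-separated
// list of RPC endpoints such as "192.168.1.10:50052,192.168.1.11:50052".
// An empty string yields no endpoints, mirroring the added non-empty check.
#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> split_servers(const char * rpc_servers) {
    std::vector<std::string> out;
    if (rpc_servers == nullptr || rpc_servers[0] == '\0') {
        return out; // nothing to parse
    }
    std::string servers(rpc_servers);
    size_t pos = 0;
    while ((pos = servers.find(',')) != std::string::npos) {
        out.push_back(servers.substr(0, pos));
        servers.erase(0, pos + 1);
    }
    out.push_back(servers); // last (or only) endpoint
    return out;
}

int main() {
    for (const auto & s : split_servers("192.168.1.10:50052,192.168.1.11:50052")) {
        printf("endpoint: %s\n", s.c_str());
    }
    return 0;
}
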
@@ -16304,17 +16315,7 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_RPC)
-        for (auto & server : model->rpc_servers) {
-            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
@@ -16406,6 +16407,18 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
+#endif
+#if defined(GGML_USE_RPC)
+        for (int i = 0; i < (int)model->rpc_servers.size(); i++) {
+            const char * endpoint = model->rpc_servers[i].c_str();
+            ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
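
(Reading the hunks together: local backend devices keep indices 0 .. dev_count - rpc_count - 1 and the RPC servers are appended at the end of the device range, so device - dev_count + rpc_count recovers the server's position in model.rpc_servers. A minimal standalone sketch of that mapping follows; the device counts and endpoint addresses are made-up examples, not values from this commit.)

// Standalone sketch (assumed) of the device-index convention introduced above:
// local GPUs occupy the front of the range, RPC servers occupy the tail.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    // hypothetical setup: 2 local GPUs plus 2 RPC servers
    const int local_devices = 2;
    const std::vector<std::string> rpc_servers = { "192.168.1.10:50052", "192.168.1.11:50052" };

    const int rpc_count = (int) rpc_servers.size();
    const int dev_count = local_devices + rpc_count;

    for (int device = 0; device < dev_count; device++) {
        if (device >= dev_count - rpc_count) {
            // tail of the range -> RPC device; recover its index into rpc_servers
            printf("device %d -> RPC server %s\n", device, rpc_servers[device - dev_count + rpc_count].c_str());
        } else {
            // front of the range -> local GPU
            printf("device %d -> local GPU %d\n", device, device);
        }
    }
    return 0;
}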