@@ -2371,13 +2371,34 @@ struct llama_context {
23712371 struct llama_control_vector cvec;
23722372};
23732373
2374+ static size_t llama_get_device_count(const llama_model & model) {
2375+ size_t count = 1;
2376+ #if defined(GGML_USE_CUDA)
2377+ count = ggml_backend_cuda_get_device_count();
2378+ #elif defined(GGML_USE_SYCL)
2379+ count = ggml_backend_sycl_get_device_count();
2380+ #elif defined(GGML_USE_VULKAN)
2381+ count = ggml_backend_vk_get_device_count();
2382+ #endif
2383+ #if defined(GGML_USE_RPC)
2384+ count += model.rpc_servers.size();
2385+ #endif
2386+ return count;
2387+ GGML_UNUSED(model);
2388+ }
2389+
23742390static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
23752391 ggml_backend_buffer_type_t buft = nullptr;
23762392
2377- #ifdef GGML_USE_RPC
2378- std::string endpoint = model.rpc_servers[gpu];
2379- buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
2380- #elif defined(GGML_USE_METAL)
2393+ #if defined(GGML_USE_RPC)
2394+ int dev_count = (int)llama_get_device_count(model);
2395+ int rpc_count = (int)model.rpc_servers.size();
2396+ if (gpu >= dev_count - rpc_count) {
2397+ const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
2398+ return ggml_backend_rpc_buffer_type(endpoint);
2399+ }
2400+ #endif
2401+ #if defined(GGML_USE_METAL)
23812402 buft = ggml_backend_metal_buffer_type();
23822403#elif defined(GGML_USE_CUDA)
23832404 buft = ggml_backend_cuda_buffer_type(gpu);
@@ -2425,29 +2446,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
24252446 GGML_UNUSED(tensor_split);
24262447}
24272448
2428- static size_t llama_get_device_count(const llama_model & model) {
2429- #if defined(GGML_USE_RPC)
2430- return model.rpc_servers.size();
2431- #elif defined(GGML_USE_CUDA)
2432- return ggml_backend_cuda_get_device_count();
2433- #elif defined(GGML_USE_SYCL)
2434- return ggml_backend_sycl_get_device_count();
2435- #elif defined(GGML_USE_VULKAN)
2436- return ggml_backend_vk_get_device_count();
2437- #else
2438- return 1;
2439- #endif
2440- GGML_UNUSED(model);
2441- }
2442-
24432449static size_t llama_get_device_memory(const llama_model & model, int device) {
24442450#if defined(GGML_USE_RPC)
2445- size_t total;
2446- size_t free;
2447- std::string endpoint = model.rpc_servers[device];
2448- ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
2449- return free;
2450- #elif defined(GGML_USE_CUDA)
2451+ int dev_count = (int)llama_get_device_count(model);
2452+ int rpc_count = (int)model.rpc_servers.size();
2453+ if (device >= dev_count - rpc_count) {
2454+ size_t total;
2455+ size_t free;
2456+ const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
2457+ ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
2458+ return free;
2459+ }
2460+ #endif
2461+ #if defined(GGML_USE_CUDA)
24512462 size_t total;
24522463 size_t free;
24532464 ggml_backend_cuda_get_device_memory(device, &free, &total);
@@ -16160,7 +16171,7 @@ struct llama_model * llama_load_model_from_file(
1616016171 return true;
1616116172 };
1616216173 }
16163- if (params.rpc_servers != nullptr) {
16174+ if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0' ) {
1616416175 // split the servers set them into model->rpc_servers
1616516176 std::string servers(params.rpc_servers);
1616616177 size_t pos = 0;
@@ -16323,17 +16334,7 @@ struct llama_context * llama_new_context_with_model(
1632316334
1632416335 if (!hparams.vocab_only) {
1632516336 // initialize backends
16326- #if defined(GGML_USE_RPC)
16327- for (auto & server : model->rpc_servers) {
16328- ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
16329- if (backend == nullptr) {
16330- LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
16331- llama_free(ctx);
16332- return nullptr;
16333- }
16334- ctx->backends.push_back(backend);
16335- }
16336- #elif defined(GGML_USE_METAL)
16337+ #if defined(GGML_USE_METAL)
1633716338 if (model->n_gpu_layers > 0) {
1633816339 ctx->backend_metal = ggml_backend_metal_init();
1633916340 if (ctx->backend_metal == nullptr) {
@@ -16425,6 +16426,19 @@ struct llama_context * llama_new_context_with_model(
1642516426 }
1642616427 ctx->backends.push_back(backend);
1642716428 }
16429+ #endif
16430+ #if defined(GGML_USE_RPC)
16431+ if (model->n_gpu_layers > 0) {
16432+ for (const auto & endpoint : model->rpc_servers) {
16433+ ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
16434+ if (backend == nullptr) {
16435+ LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
16436+ llama_free(ctx);
16437+ return nullptr;
16438+ }
16439+ ctx->backends.push_back(backend);
16440+ }
16441+ }
1642816442#endif
1642916443 ctx->backend_cpu = ggml_backend_cpu_init();
1643016444 if (ctx->backend_cpu == nullptr) {
0 commit comments