@@ -2370,34 +2370,13 @@ struct llama_context {
     struct llama_control_vector cvec;
 };

-static size_t llama_get_device_count(const llama_model & model) {
-    size_t count = 1;
-#if defined(GGML_USE_CUDA)
-    count = ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    count = ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    count = ggml_backend_vk_get_device_count();
-#endif
-#if defined(GGML_USE_RPC)
-    count += model.rpc_servers.size();
-#endif
-    return count;
-    GGML_UNUSED(model);
-}
-
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;

-#if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
-    int rpc_count = (int)model.rpc_servers.size();
-    if (gpu >= dev_count - rpc_count) {
-        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
-        return ggml_backend_rpc_buffer_type(endpoint);
-    }
-#endif
-#if defined(GGML_USE_METAL)
+#ifdef GGML_USE_RPC
+    std::string endpoint = model.rpc_servers[gpu];
+    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
+#elif defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
     buft = ggml_backend_cuda_buffer_type(gpu);
@@ -2443,19 +2422,29 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
     GGML_UNUSED(tensor_split);
 }

-static size_t llama_get_device_memory(const llama_model & model, int device) {
+static size_t llama_get_device_count(const llama_model & model) {
 #if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
-    int rpc_count = (int)model.rpc_servers.size();
-    if (device >= dev_count - rpc_count) {
-        size_t total;
-        size_t free;
-        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
-        ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
-        return free;
-    }
+    return model.rpc_servers.size();
+#elif defined(GGML_USE_CUDA)
+    return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    return ggml_backend_vk_get_device_count();
+#else
+    return 1;
 #endif
-#if defined(GGML_USE_CUDA)
+    GGML_UNUSED(model);
+}
+
+static size_t llama_get_device_memory(const llama_model & model, int device) {
+#if defined(GGML_USE_RPC)
+    size_t total;
+    size_t free;
+    std::string endpoint = model.rpc_servers[device];
+    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+    return free;
+#elif defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &free, &total);
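
As a quick illustration of the RPC path that the new llama_get_device_memory takes, here is a minimal standalone sketch of querying one endpoint directly with ggml_backend_rpc_get_device_memory (the call matches the one in the hunk above; the header name, the endpoint address, and the surrounding main() are assumptions, not part of this change):

    // build with GGML_USE_RPC; the endpoint address below is a placeholder
    #include "ggml-rpc.h"
    #include <cstdio>

    int main() {
        const char * endpoint = "192.168.88.10:50052"; // hypothetical RPC server
        size_t free_mem  = 0;
        size_t total_mem = 0;
        // same call the diff uses inside llama_get_device_memory
        ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);
        printf("free: %zu bytes, total: %zu bytes\n", free_mem, total_mem);
        return 0;
    }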
@@ -15995,7 +15984,7 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
+    if (params.rpc_servers != nullptr) {
         // split the servers set them into model->rpc_servers
         std::string servers(params.rpc_servers);
         size_t pos = 0;
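
The hunk above cuts off right after `size_t pos = 0;`, before the loop that performs the split described by the comment. For reference, a minimal sketch of that kind of comma-splitting into a std::vector (the helper name is hypothetical; the ',' delimiter follows the comma-separated rpc_servers convention):

    #include <string>
    #include <vector>

    // hypothetical helper: turn "host1:port,host2:port" into individual endpoints
    static std::vector<std::string> split_rpc_servers(std::string servers) {
        std::vector<std::string> out;
        size_t pos = 0;
        while ((pos = servers.find(',')) != std::string::npos) {
            out.push_back(servers.substr(0, pos));
            servers.erase(0, pos + 1);
        }
        out.push_back(servers); // last (or only) endpoint
        return out;
    }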
@@ -16158,7 +16147,17 @@ struct llama_context * llama_new_context_with_model(

     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+        for (auto & server : model->rpc_servers) {
+            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
+#elif defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
@@ -16250,19 +16249,6 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
-#endif
-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
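
Taken together, when GGML_USE_RPC is defined the RPC endpoints become the offload devices: device indices map one-to-one onto model->rpc_servers, and one RPC backend is created per endpoint at context creation. A minimal caller-side sketch, assuming the llama.h C API of this period (the model path and endpoint addresses are placeholders):

    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();
        // comma-separated list of RPC endpoints; the addresses here are placeholders
        mparams.rpc_servers  = "192.168.88.10:50052,192.168.88.11:50052";
        mparams.n_gpu_layers = 99; // offload layers to the RPC devices

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        llama_context * ctx = llama_new_context_with_model(model, cparams);
        if (ctx == NULL) {
            llama_free_model(model);
            return 1;
        }

        // ... evaluate tokens as usual ...

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }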