Skip to content

Commit ed300c5

Browse files
wjsi and chaokunyang authored
[BACKPORT][Ray] Use main pool as owner when autoscale disabled (#2878) (#2903)
Co-authored-by: Shawn <[email protected]>
1 parent 2152931 commit ed300c5

File tree

8 files changed

+45
-26
lines changed

8 files changed

+45
-26
lines changed

.github/workflows/cancel-prev.yml

Lines changed: 0 additions & 12 deletions
This file was deleted.

.github/workflows/core-ci.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ on:
77
pull_request:
88
types: ['opened', 'reopened', 'synchronize']
99

10+
concurrency:
11+
group: ${{ github.workflow }}-${{ github.ref }}
12+
cancel-in-progress: true
13+
1014
jobs:
1115
build:
1216
runs-on: ${{ matrix.os }}

.github/workflows/docker-cd.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ on:
77
tags:
88
- '*'
99

10+
concurrency:
11+
group: ${{ github.workflow }}-${{ github.ref }}
12+
cancel-in-progress: true
13+
1014
jobs:
1115
build:
1216
runs-on: ubuntu-latest

.github/workflows/os-compat-ci.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ on:
77
pull_request:
88
types: ['opened', 'reopened', 'synchronize']
99

10+
concurrency:
11+
group: ${{ github.workflow }}-${{ github.ref }}
12+
cancel-in-progress: true
13+
1014
jobs:
1115
build:
1216
runs-on: ${{ matrix.os }}

.github/workflows/platform-ci.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ on:
77
pull_request:
88
types: ['opened', 'reopened', 'synchronize']
99

10+
concurrency:
11+
group: ${{ github.workflow }}-${{ github.ref }}
12+
cancel-in-progress: true
13+
1014
jobs:
1115
build:
1216
runs-on: ${{ matrix.os }}

.github/workflows/pypi-cd.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ on:
55
tags:
66
- '*'
77

8+
concurrency:
9+
group: ${{ github.workflow }}-${{ github.ref }}
10+
cancel-in-progress: true
11+
812
jobs:
913
build:
1014
name: Build wheels on ${{ matrix.os }} for ${{ matrix.arch }}

mars/services/storage/core.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -550,17 +550,6 @@ async def _setup_storage(
550550
):
551551
backend = get_storage_backend(storage_backend)
552552
storage_config = storage_config or dict()
553-
554-
from ..cluster import ClusterAPI
555-
556-
if backend.name == "ray":
557-
try:
558-
cluster_api = await ClusterAPI.create(self.address)
559-
supervisor_address = (await cluster_api.get_supervisors())[0]
560-
# ray storage backend need to set supervisor as owner to avoid data lost when worker dies.
561-
storage_config["owner"] = supervisor_address
562-
except mo.ActorNotExist:
563-
pass
564553
init_params, teardown_params = await backend.setup(**storage_config)
565554
client = backend(**init_params)
566555
self._init_params[band_name][storage_backend] = init_params

mars/services/storage/worker/service.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,31 @@ async def start(self):
3636
backends = storage_configs.get("backends")
3737
options = storage_configs.get("default_config", dict())
3838
transfer_block_size = options.get("transfer_block_size", None)
39-
backend_config = {
40-
backend: storage_configs.get(backend, dict()) for backend in backends
41-
}
39+
backend_config = {}
40+
for backend in backends:
41+
storage_config = storage_configs.get(backend, dict())
42+
backend_config[backend] = storage_config
43+
if backend == "ray":
44+
# Specifying the supervisor as the Ray owner is costly when Mars does a shuffle, because all m*n shuffle
45+
# objects would need the supervisor as their owner. So enable it only when autoscale is on, to avoid data
46+
# loss when scaling in. This limitation can be removed once Ray supports ownership transfer.
47+
if (
48+
self._config.get("scheduling", {})
49+
.get("autoscale", {})
50+
.get("enabled", False)
51+
):
52+
try:
53+
from ...cluster.api import ClusterAPI
54+
55+
cluster_api = await ClusterAPI.create(self._address)
56+
supervisor_address = (await cluster_api.get_supervisors())[0]
57+
# The Ray storage backend needs the supervisor set as owner to avoid data loss when a worker dies.
58+
owner = supervisor_address
59+
except mo.ActorNotExist:
60+
owner = self._address
61+
else:
62+
owner = self._address
63+
storage_config["owner"] = owner
4264

4365
await mo.create_actor(
4466
StorageManagerActor,

0 commit comments

Comments
 (0)