Skip to content

Aggregation's shard stuck in INITIALIZING #4370

@mikle7771

Description

@mikle7771

I made cluster m3dbnode + coordinator + aggregator
with M3AGGREGATOR_HOST_ID: m3aggregator

Configurations:

node:

db:

  hostID:
    resolver: config
    value: "m3db-node-0"

  index:
    # The maximum number of outstanding QueryID requests to service concurrently
    maxQueryIDsConcurrency: 32
    # Limit on the max number of states used by a regexp deterministic finite automaton. Default = 10000
    regexpDFALimit: 20000
    # Limit on the max number of bytes used by the finite state automaton. Default 10mb (10 million)
    regexpFSALimit: 52428800
    # Likelihood that an incoming write is written to the next block when close to the block boundary
    forwardIndexProbability: 1.0   # always write to the forward index
    # Threshold for forward writes, as a fraction of the given namespace's bufferFuture
    forwardIndexThreshold: 0.1     # write when a point is close to the block boundary
  client:
    # Consistency level for writes.
    writeConsistencyLevel: majority
    # Consistency level for reads.
    readConsistencyLevel: unstrict_majority
    # Timeout for writes.
    writeTimeout: 10s
    # Timeout for reads.
    fetchTimeout: 120s
    # Timeout for establishing a connection to the cluster.
    connectTimeout: 30s
    # Configuration for retrying writes.
    writeRetry:
        initialBackoff: 500ms
        backoffFactor: 2
        maxRetries: 3
        jitter: true
    # Configuration for retrying reads.
    fetchRetry:
        initialBackoff: 500ms
        backoffFactor: 1.5
        maxRetries: 3
        jitter: true
    # Number of times we background health check for a node can fail before
    # considering the node unhealthy.
    backgroundHealthCheckFailLimit: 3
    backgroundHealthCheckFailThrottleFactor: 0.7
  gcPercentage: 30
  bootstrap:
    commitlog:
      # Whether tail end of corrupted commit logs cause an error on bootstrap.
      returnUnfulfilledForCorruptCommitLogFiles: false
  cache:
    # Caching policy for database blocks.
    series:
      policy: recently_read
      lru:
        maxBlocks: 1000
        eventsChannelSize: 10000
    # PostingsList cache policy
    postingsList:
      size: 100000
      cacheRegexp: true
      cacheTerms: true
    # Compiled regexp cache for query regexp
    regexp:
      size: 1000
  commitlog:
    # Maximum number of bytes that will be buffered before flushing the commitlog.
    flushMaxBytes: 1048576
    # Maximum amount of time data can remain buffered before flushing the commitlog.
    flushEvery: 500ms
    # Configuration for the commitlog queue. High throughput setups may require higher
    # values. Higher values will use more memory.
    queue:
      calculationType: fixed
      size: 2097152
  #Block retrieval policy
  blockRetrieve:
    # Concurrency to fetch blocks from disk
    fetchConcurrency: 20
    # Globally enables/disables callbacks used to cache blocks fetched from disk
    cacheBlocksOnRetrieve: true
  filesystem:
    # Directory to store M3DB data in.
    filePathPrefix: /var/lib/m3db
    # Various fixed-sized buffers used for M3DB I/O.
    writeBufferSize: 65536
    dataReadBufferSize: 65536
    infoReadBufferSize: 128
    seekReadBufferSize: 4096
    # Maximum Mib/s that can be written to disk by background operations like flushing
    # and snapshotting to prevent them from interfering with the commitlog. Increasing
    # this value can make node adds significantly faster if the underlying disk can
    # support the throughput.
    throughputLimitMbps: 2000.0
    throughputCheckEvery: 128
    mmap:
      # Huge pages config (Linux only)
      hugeTLB:
        enabled: false
    # Forces the mmap that stores the index lookup bytes to be an anonymous region in memory
    force_index_summaries_mmap_memory: true
    # Forces the mmap that stores the bloom filter bytes to be an anonymous region in memory
    force_bloom_filter_mmap_memory: true

  discovery:
    # The type of discovery configuration used, valid options: [config, m3db_single_node, m3db_cluster, m3aggregator_cluster]
    type: config
    config:
      service:
        zone: embedded
        env: default_env
        service: m3db
        etcdClusters:
          - zone: embedded
            endpoints:
              - 127.0.0.1:2379

coordinator:

listenAddress: "0.0.0.0:7201"

query:
  defaultEngine: m3query
  timeout: 120s
  consolidation:
    matchType: ids
  prometheus:
    maxSamplesPerQuery: 500000000

clusters:
  - namespaces:
    client:
      config:
        service:
          env: default_env
          zone: embedded
          service: m3db
          cacheDir: /var/lib/m3kv
          etcdClusters:
            - zone: embedded
              endpoints:
                - 127.0.0.1:2379

clusterManagement:
  etcd:
    env: default_env
    zone: embedded
    service: m3coordinator
    cacheDir: /var/lib/m3kv
    etcdClusters:
      - zone: embedded
        endpoints:
          - 127.0.0.1:2379

downsample:
  remoteAggregator:
    client:
      type: m3msg
      m3msg:
        producer:
          writer:
            topicName: aggregator_ingest
            topicServiceOverride:
              zone: embedded
              environment: default_env
            placement:
              isStaged: true
            placementServiceOverride:
              namespaces:
                placement: /placement
            connection:
              numConnections: 4
            messagePool:
              size: 16384
              watermark:
                low: 0.2
                high: 0.5

#This is for configuring the ingestion server that will receive metrics from the m3aggregators on port 7507
ingest:
  ingester:
    workerPoolSize: 20000
    opPool:
      size: 20000
    retry:
      maxRetries: 5
      jitter: true
    logSampleRate: 0.01
  m3msg:
    server:
      listenAddress: "0.0.0.0:7507"
      retry:
        maxBackoff: 10s
        jitter: true

aggregator:

m3msg:
  server:
    listenAddress: 0.0.0.0:6000
    retry:
      maxBackoff: 10s
      jitter: true
  consumer:
    messagePool:
      size: 16384
      watermark:
        low: 0.2
        high: 0.5

http:
  listenAddress: 0.0.0.0:6001
  readTimeout: 60s
  writeTimeout: 60s

kvClient:
  etcd:
    env: default_env
    zone: embedded
    service: m3aggregator
    cacheDir: /var/lib/m3kv
    etcdClusters:
      - zone: embedded
        endpoints:
          - 127.0.0.1:2379

runtimeOptions:
  kvConfig:
    environment: default_env
    zone: embedded
  writeValuesPerMetricLimitPerSecondKey: write-values-per-metric-limit-per-second
  writeValuesPerMetricLimitPerSecond: 0
  writeNewMetricLimitClusterPerSecondKey: write-new-metric-limit-cluster-per-second
  writeNewMetricLimitClusterPerSecond: 0
  writeNewMetricNoLimitWarmupDuration: 0

aggregator:
  hostID:
    resolver: environment
    envVarName: M3AGGREGATOR_HOST_ID
  instanceID:
    type: host_id
  verboseErrors: true
  metricPrefix: ""
  counterPrefix: ""
  timerPrefix: ""
  gaugePrefix: ""
  aggregationTypes:
    counterTransformFnType: empty
    timerTransformFnType: suffix
    gaugeTransformFnType: empty
    aggregationTypesPool:
      size: 1024
    quantilesPool:
      buckets:
        - count: 256
          capacity: 4
        - count: 128
          capacity: 8
  stream:
    eps: 0.001
    capacity: 32
    streamPool:
      size: 4096
    samplePool:
      size: 4096
    floatsPool:
      buckets:
        - count: 4096
          capacity: 16
        - count: 2048
          capacity: 32
        - count: 1024
          capacity: 64
  client:
    type: m3msg
    m3msg:
      producer:
        writer:
          topicName: aggregator_ingest
          topicServiceOverride:
            zone: embedded
            environment: default_env
          placement:
            isStaged: true
          placementServiceOverride:
            namespaces:
              placement: /placement
          messagePool:
            size: 16384
            watermark:
              low: 0.2
              high: 0.5
  placementManager:
    kvConfig:
      namespace: /placement
      environment: default_env
      zone: embedded
    placementWatcher:
      key: m3aggregator
      initWatchTimeout: 10s
  hashType: murmur32
  bufferDurationBeforeShardCutover: 10m
  bufferDurationAfterShardCutoff: 10m
  bufferDurationForFutureTimedMetric: 10m # Allow test to write into future.
  resignTimeout: 1m
  flushTimesManager:
    kvConfig:
      environment: default_env
      zone: embedded
    flushTimesKeyFmt: shardset/%d/flush
    flushTimesPersistRetrier:
      initialBackoff: 100ms
      backoffFactor: 2.0
      maxBackoff: 2s
      maxRetries: 3
  electionManager:
    election:
      leaderTimeout: 10s
      resignTimeout: 10s
      ttlSeconds: 10
    serviceID:
      name: m3aggregator
      environment: default_env
      zone: embedded
    electionKeyFmt: shardset/%d/lock
    campaignRetrier:
      initialBackoff: 100ms
      backoffFactor: 2.0
      maxBackoff: 2s
      forever: true
      jitter: true
    changeRetrier:
      initialBackoff: 100ms
      backoffFactor: 2.0
      maxBackoff: 5s
      forever: true
      jitter: true
    resignRetrier:
      initialBackoff: 100ms
      backoffFactor: 2.0
      maxBackoff: 5s
      forever: true
      jitter: true
    campaignStateCheckInterval: 1s
    shardCutoffCheckOffset: 30s
  flushManager:
    checkEvery: 2s
    jitterEnabled: true
    maxJitters:
      - flushInterval: 5s
        maxJitterPercent: 1.0
      - flushInterval: 10s
        maxJitterPercent: 0.5
      - flushInterval: 1m
        maxJitterPercent: 0.5
      - flushInterval: 5m
        maxJitterPercent: 0.5
      - flushInterval: 10m
        maxJitterPercent: 0.5
      - flushInterval: 15m
        maxJitterPercent: 0.5
      - flushInterval: 1h
        maxJitterPercent: 0.25
    numWorkersPerCPU: 0.5
    flushTimesPersistEvery: 10s
    maxBufferSize: 60m
    forcedFlushWindowSize: 10s
  flush:
    handlers:
      - dynamicBackend:
          name: m3msg
          hashType: murmur32
          producer:
            writer:
              topicName: aggregated_metrics
              topicServiceOverride:
                zone: embedded
                environment: default_env
              messagePool:
                size: 16384
                watermark:
                  low: 0.2
                  high: 0.5
  passthrough:
    enabled: true
  forwarding:
    maxConstDelay: 5m # Need to add some buffer window, since timed metrics by default are delayed by 1min.
  entryTTL: 1h
  entryCheckInterval: 10m
  maxTimerBatchSizePerWrite: 140
  defaultStoragePolicies: []
  maxNumCachedSourceSets: 2
  discardNaNAggregatedValues: true
  entryPool:
    size: 4096
  counterElemPool:
    size: 4096
  timerElemPool:
    size: 4096
  gaugeElemPool:
    size: 4096

Next I created placements, topics, namespaces:

curl -vvvsSf -H "Cluster-Environment-Name: default_env" -X POST http://localhost:7201/api/v1/services/m3aggregator/placement/init -d '{
    "num_shards": 1,
    "replication_factor": 1,
    "instances": [
        {
            "id": "m3aggregator",
            "isolation_group": "embedded",
            "zone": "embedded",
            "weight": 100,
            "endpoint": "127.0.0.1:6000",
            "hostname": "127.0.0.1",
            "port": 6000
        }
    ]
}'

curl -vvvsSf -H "Cluster-Environment-Name: default_env" -X POST http://localhost:7201/api/v1/services/m3coordinator/placement/init -d '{
    "instances": [
        {
            "id": "m3coordinator",
            "zone": "embedded",
            "endpoint": "127.0.0.1:7507",
            "hostname": "127.0.0.1",
            "port": 7507
        }
    ]
}'

curl -vvvsSf -H "Cluster-Environment-Name: default_env" -H "Topic-Name: aggregator_ingest" -X POST http://localhost:7201/api/v1/topic/init -d '{
    "numberOfShards": 1
}'

curl -vvvsSf -H "Cluster-Environment-Name: default_env" -H "Topic-Name: aggregator_ingest" -X POST http://localhost:7201/api/v1/topic -d '{
  "consumerService": {
    "serviceId": {
      "name": "m3aggregator",
      "environment": "default_env",
      "zone": "embedded"
    },
    "consumptionType": "REPLICATED",
    "messageTtlNanos": "300000000000"
  }
}'



curl -vvvsSf -H "Cluster-Environment-Name: default_env" -H "Topic-Name: aggregated_metrics" -X POST http://localhost:7201/api/v1/topic/init -d '{
    "numberOfShards": 1
}'


curl -vvvsSf -H "Cluster-Environment-Name: default_env" -H "Topic-Name: aggregated_metrics" -X POST http://localhost:7201/api/v1/topic -d '{
  "consumerService": {
    "serviceId": {
      "name": "m3coordinator",
      "environment": "default_env",
      "zone": "embedded"
    },
    "consumptionType": "SHARED",
    "messageTtlNanos": "300000000000"
  }
}'

curl -X POST http://localhost:7201/api/v1/database/create -d '{
  "type": "cluster",
  "namespaceName": "default",
  "retentionTime": "1h11m",
  "numShards": 1,
  "replicationFactor": 1,
  "hosts": [
    {
      "id": "m3db-node-0",
      "isolationGroup": "embedded",
      "zone": "embedded",
      "weight": 100,
      "port": 9000
    }
  ]
}' | jq .

curl -X POST http://localhost:7201/api/v1/database/create -d '{
  "type": "cluster",
  "namespaceName": "aggregated_5m_12h",
  "retentionTime": "12h11m"
}' | jq .

curl -X POST http://localhost:7201/api/v1/database/create -d '{
  "type": "cluster",
  "namespaceName": "aggregated_10m_24h",
  "retentionTime": "24h11m"
}' | jq .

curl -X POST http://localhost:7201/api/v1/database/create -d '{
  "type": "cluster",
  "namespaceName": "aggregated_15m_72h",
  "retentionTime": "72h11m"
}' | jq .

Enable downsampling for all namespaces, for example:

curl -X PUT http://localhost:7201/api/v1/services/m3db/namespace -d '{
  "name": "aggregated_15m_72h",
  "options": {
     "aggregationOptions": {
      "aggregations": [
       {
         "aggregated": true,
         "attributes": {
         "resolutionDuration": "15m",
         "downsampleOptions": { "all": true }
       }
      }
    ]
   }
  }
}' | jq .

All namespaces were then marked ready, for example:
curl -X POST http://localhost:7201/api/v1/services/m3db/namespace/ready -d '{
  "name": "default"
}' | jq .

Problem

  "placement": {
    "instances": {
      "m3aggregator": {
        "id": "m3aggregator",
        "isolationGroup": "embedded",
        "zone": "embedded",
        "weight": 100,
        "endpoint": "127.0.0.1:6000",
        "shards": [
          {
            "id": 0,
            "state": "INITIALIZING",
            "sourceId": "",
            "cutoverNanos": "1753175820000000000",
            "cutoffNanos": "0",
            "redirectToShardId": null
          }
        ],
        "shardSetId": 1,
        "hostname": "127.0.0.1",
        "port": 6000,
        "metadata": {
          "debugPort": 0
        }
      }
    },
    "replicaFactor": 1,
    "numShards": 1,
    "isSharded": true,
    "cutoverTime": "0",
    "isMirrored": true,
    "maxShardSetId": 1
  },
  "version": 1
}

There are metrics only in the namespaces default and aggregated_5m_12h
If aggregation is turned off, all namespaces worked

What am I doing wrong?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions