Grcov report - spawner.rs

1

// Copyright (C) Moondance Labs Ltd.

2

// This file is part of Tanssi.

3

4

// Tanssi is free software: you can redistribute it and/or modify

5

// it under the terms of the GNU General Public License as published by

6

// the Free Software Foundation, either version 3 of the License, or

7

// (at your option) any later version.

8

9

// Tanssi is distributed in the hope that it will be useful,

10

// but WITHOUT ANY WARRANTY; without even the implied warranty of

11

// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

12

// GNU General Public License for more details.

13

14

// You should have received a copy of the GNU General Public License

15

// along with Tanssi.  If not, see <http://www.gnu.org/licenses/>.

16

17

//! Container Chain Spawner

18

//!

19

//! Controls the starting and stopping of container chains.

20

//!

21

//! For more information about when the database is deleted, check the

22

//! [Keep db flowchart](https://raw.githubusercontent.com/moondance-labs/tanssi/master/docs/keep_db_flowchart.png)

23

24

use {

25

    crate::{

26

        cli::ContainerChainCli,

27

        monitor::{SpawnedContainer, SpawnedContainersMonitor},

28

        rpc::generate_rpc_builder::GenerateRpcBuilder,

29

        service::{

30

            start_node_impl_container, ContainerChainClient, MinimalContainerRuntimeApi,

31

            ParachainClient,

32

},

33

},

34

    cumulus_primitives_core::ParaId,

35

    cumulus_relay_chain_interface::RelayChainInterface,

36

    dancebox_runtime::{opaque::Block as OpaqueBlock, Block},

37

    dc_orchestrator_chain_interface::{OrchestratorChainInterface, PHash},

38

    frame_support::{CloneNoBound, DefaultNoBound},

39

    fs2::FileExt,

40

    futures::FutureExt,

41

    node_common::command::generate_genesis_block,

42

    polkadot_primitives::CollatorPair,

43

    sc_cli::{Database, SyncMode},

44

    sc_network::config::MultiaddrWithPeerId,

45

    sc_service::SpawnTaskHandle,

46

    sc_transaction_pool::TransactionPoolHandle,

47

    sp_core::H256,

48

    sp_keystore::KeystorePtr,

49

    sp_runtime::traits::Block as BlockT,

50

    std::{

51

        any::Any,

52

        collections::{HashMap, HashSet},

53

        marker::PhantomData,

54

        path::{Path, PathBuf},

55

        sync::{Arc, Mutex},

56

        time::Instant,

57

},

58

    tokio::{

59

        sync::{mpsc, oneshot},

60

        time::{sleep, Duration},

61

},

62

    tokio_util::sync::CancellationToken,

63

};

64

65

/// Timeout to wait for the database to close before starting it again, used in `wait_for_paritydb_lock`.

66

/// This is the max timeout, if the db is closed in 1 second then that function will only wait 1 second.

67

const MAX_DB_RESTART_TIMEOUT: Duration = Duration::from_secs(60);

68

69

/// Block diff threshold above which we decide it will be faster to delete the database and

70

/// use warp sync, rather than using full sync to download a large number of blocks.

71

/// This is only needed because warp sync does not support syncing from a state that is not

72

/// genesis, it falls back to full sync in that case.

73

/// 30_000 blocks = 50 hours at 6s/block.

74

/// Assuming a syncing speed of 100 blocks per second, this will take 5 minutes to sync.

75

const MAX_BLOCK_DIFF_FOR_FULL_SYNC: u32 = 30_000;

76

77

/// Task that handles spawning and stopping container chains based on assignment.

78

/// The main loop is [rx_loop](ContainerChainSpawner::rx_loop).

79

pub struct ContainerChainSpawner<

80

    RuntimeApi: MinimalContainerRuntimeApi,

81

    TGenerateRpcBuilder: GenerateRpcBuilder<RuntimeApi>,

82

> {

83

    /// Start container chain params

84

    pub params: ContainerChainSpawnParams<RuntimeApi, TGenerateRpcBuilder>,

85

86

    /// State

87

    pub state: Arc<Mutex<ContainerChainSpawnerState>>,

88

89

    /// Before the first assignment, there is a db cleanup process that removes folders of container

90

    /// chains that we are no longer assigned to.

91

    pub db_folder_cleanup_done: bool,

92

93

    /// Async callback that enables collation on the orchestrator chain

94

    pub collate_on_tanssi:

95

        Arc<dyn Fn() -> (CancellationToken, futures::channel::oneshot::Receiver<()>) + Send + Sync>,

96

    /// Stores the cancellation token used to stop the orchestrator chain collator process.

97

    /// When this is None, the orchestrator collator is not running.

98

    pub collation_cancellation_constructs:

99

        Option<(CancellationToken, futures::channel::oneshot::Receiver<()>)>,

100

101

102

/// Struct with all the params needed to start a container chain node given the CLI arguments,

103

/// and creating the ChainSpec from on-chain data from the orchestrator chain.

104

/// These params must be the same for all container chains, params that change such as the

105

/// `container_chain_para_id` should be passed as separate arguments to the [try_spawn] function.

106

///

107

/// This struct MUST NOT contain types (outside of `Option<CollationParams>`) obtained through

108

/// running an embedded orchestrator node, as this will prevent spawning a container chain in a node

109

/// connected to an orchestrator node through WebSocket.

110

#[derive(CloneNoBound)]

111

pub struct ContainerChainSpawnParams<

112

    RuntimeApi: MinimalContainerRuntimeApi,

113

    TGenerateRpcBuilder: GenerateRpcBuilder<RuntimeApi>,

114

> {

115

    pub orchestrator_chain_interface: Arc<dyn OrchestratorChainInterface>,

116

    pub container_chain_cli: ContainerChainCli,

117

    pub tokio_handle: tokio::runtime::Handle,

118

    pub chain_type: sc_chain_spec::ChainType,

119

    pub relay_chain: String,

120

    pub relay_chain_interface: Arc<dyn RelayChainInterface>,

121

    pub sync_keystore: KeystorePtr,

122

    pub spawn_handle: SpawnTaskHandle,

123

    pub collation_params: Option<CollationParams>,

124

    pub data_preserver: bool,

125

    pub generate_rpc_builder: TGenerateRpcBuilder,

126

    pub override_sync_mode: Option<SyncMode>,

127

128

    pub phantom: PhantomData<RuntimeApi>,

129

130

131

/// Params specific to collation. This struct can contain types obtained through running an

132

/// embedded orchestrator node.

133

#[derive(Clone)]

134

pub struct CollationParams {

135

    pub collator_key: CollatorPair,

136

    pub orchestrator_tx_pool: Option<Arc<TransactionPoolHandle<OpaqueBlock, ParachainClient>>>,

137

    pub orchestrator_client: Option<Arc<ParachainClient>>,

138

    pub orchestrator_para_id: ParaId,

139

    /// If this is `false`, then `orchestrator_tx_pool` and `orchestrator_client` must be `Some`.

140

    pub solochain: bool,

141

142

143

/// Mutable state for container chain spawner. Keeps track of running chains.

144

#[derive(DefaultNoBound)]

145

pub struct ContainerChainSpawnerState {

146

    spawned_container_chains: HashMap<ParaId, ContainerChainState>,

147

    assigned_para_id: Option<ParaId>,

148

    next_assigned_para_id: Option<ParaId>,

149

    failed_para_ids: HashSet<ParaId>,

150

    // For debugging and detecting errors

151

    pub spawned_containers_monitor: SpawnedContainersMonitor,

152

153

154

pub struct ContainerChainState {

155

    /// Handle that can be used to stop the container chain

156

    stop_handle: StopContainerChain,

157

    /// Database path

158

    db_path: PathBuf,

159

160

161

/// Stops a container chain when signal is sent. The bool means `keep_db`, whether to keep the

162

/// container chain database (true) or remove it (false).

163

pub struct StopContainerChain {

164

    signal: oneshot::Sender<bool>,

165

    id: usize,

166

167

168

/// Messages used to control the `ContainerChainSpawner`. This is needed because one of the fields

169

/// of `ContainerChainSpawner` is not `Sync`, so we cannot simply pass an

170

/// `Arc<ContainerChainSpawner>` to other threads.

171

#[derive(Debug)]

172

pub enum CcSpawnMsg {

173

    /// Update container chain assignment

174

    UpdateAssignment {

175

        current: Option<ParaId>,

176

        next: Option<ParaId>,

177

},

178

179

180

// Separate function to allow using `?` to return a result, and also to avoid using `self` in an

181

// async function. Mutable state should be written by locking `state`.

182

// TODO: `state` should be an async mutex

183

async fn try_spawn<

184

    RuntimeApi: MinimalContainerRuntimeApi,

185

    TGenerateRpcBuilder: GenerateRpcBuilder<RuntimeApi>,

186

>(

187

    try_spawn_params: ContainerChainSpawnParams<RuntimeApi, TGenerateRpcBuilder>,

188

    state: Arc<Mutex<ContainerChainSpawnerState>>,

189

    container_chain_para_id: ParaId,

190

    start_collation: bool,

191

) -> sc_service::error::Result<()> {

192

    let ContainerChainSpawnParams {

193

        orchestrator_chain_interface,

194

        mut container_chain_cli,

195

        tokio_handle,

196

        chain_type,

197

        relay_chain,

198

        relay_chain_interface,

199

        sync_keystore,

200

        spawn_handle,

201

        mut collation_params,

202

        data_preserver,

203

        generate_rpc_builder,

204

        override_sync_mode,

205

..

206

    } = try_spawn_params;

207

    // Preload genesis data from orchestrator chain storage.

208

209

    // TODO: the orchestrator chain node may not be fully synced yet,

210

    // in that case we will be reading an old state.

211

    let orchestrator_block_hash = orchestrator_chain_interface

212

        .finalized_block_hash()

213

        .await

214

        .map_err(|e| format!("Failed to get latest block hash: {e}"))?;

215

216

    log::info!(

217

        "Detected assignment for container chain {}",

218

        container_chain_para_id

219

);

220

221

    let genesis_data = orchestrator_chain_interface

222

        .genesis_data(orchestrator_block_hash, container_chain_para_id)

223

        .await

224

        .map_err(|e| format!("Failed to call genesis_data runtime api: {}", e))?

225

        .ok_or_else(|| {

226

            format!(

227

                "No genesis data registered for container chain id {}",

228

                container_chain_para_id

229

230

        })?;

231

232

    let boot_nodes_raw = orchestrator_chain_interface

233

        .boot_nodes(orchestrator_block_hash, container_chain_para_id)

234

        .await

235

        .map_err(|e| format!("Failed to call boot_nodes runtime api: {}", e))?;

236

237

    if boot_nodes_raw.is_empty() {

238

        log::warn!(

239

            "No boot nodes registered on-chain for container chain {}",

240

            container_chain_para_id

241

);

242

243

    let boot_nodes = parse_boot_nodes_ignore_invalid(boot_nodes_raw, container_chain_para_id);

244

    if boot_nodes.is_empty() {

245

        log::warn!(

246

            "No valid boot nodes for container chain {}",

247

            container_chain_para_id

248

);

249

250

251

    container_chain_cli

252

        .preload_chain_spec_from_genesis_data(

253

            container_chain_para_id.into(),

254

            genesis_data,

255

            chain_type.clone(),

256

            relay_chain.clone(),

257

            boot_nodes,

258

259

        .map_err(|e| {

260

            format!(

261

                "failed to create container chain chain spec from on chain genesis data: {}",

262

263

264

        })?;

265

266

    log::info!(

267

        "Loaded chain spec for container chain {}",

268

        container_chain_para_id

269

);

270

271

    if !data_preserver && !start_collation {

272

        log::info!("This is a syncing container chain, using random ports");

273

274

        collation_params = None;

275

276

        // Use random ports to avoid conflicts with the other running container chain

277

        let random_ports = [23456, 23457, 23458];

278

279

        container_chain_cli

280

            .base

281

            .base

282

            .prometheus_params

283

            .prometheus_port = Some(random_ports[0]);

284

        container_chain_cli.base.base.network_params.port = Some(random_ports[1]);

285

        container_chain_cli.base.base.rpc_params.rpc_port = Some(random_ports[2]);

286

287

        // Use a different network key for syncing the chain. This is to avoid full nodes banning collators

288

        // by mistake, with error:

289

        // Reason: Unsupported protocol. Banned, disconnecting.

290

//

291

        // Store this new key in a new path to not conflict with the real network key.

292

        // The same key is used for all container chains, that doesn't seem to cause problems.

293

294

        // Collator-01/data/containers

295

        let mut syncing_network_key_path = container_chain_cli

296

            .base

297

            .base

298

            .shared_params

299

            .base_path

300

            .clone()

301

            .expect("base path always set");

302

        // Collator-01/data/containers/keystore/network_syncing/secret_ed25519

303

        syncing_network_key_path.push("keystore/network_syncing/secret_ed25519");

304

305

        // Clear network key_params. These will be used by the collating process, but not by the syncing process.

306

        container_chain_cli

307

            .base

308

            .base

309

            .network_params

310

            .node_key_params

311

            .node_key = None;

312

        container_chain_cli

313

            .base

314

            .base

315

            .network_params

316

            .node_key_params

317

            .node_key_file = Some(syncing_network_key_path);

318

        // Generate a new network key if it has not been generated already.

319

        // This is safe to enable if your node is not an authority. We use it only for syncing the network.

320

        container_chain_cli

321

            .base

322

            .base

323

            .network_params

324

            .node_key_params

325

            .unsafe_force_node_key_generation = true;

326

327

328

    let validator = collation_params.is_some();

329

330

    // Update CLI params

331

    container_chain_cli.base.para_id = Some(container_chain_para_id.into());

332

    container_chain_cli

333

        .base

334

        .base

335

        .import_params

336

        .database_params

337

        .database = Some(Database::ParityDb);

338

339

    let keep_db = container_chain_cli.base.keep_db;

340

341

    // Get a closure that checks if db_path exists. Need this to know when to use full sync instead of warp sync.

342

    let check_db_exists = {

343

        // Get db_path from config

344

        let mut container_chain_cli_config = sc_cli::SubstrateCli::create_configuration(

345

            &container_chain_cli,

346

            &container_chain_cli,

347

            tokio_handle.clone(),

348

349

        .map_err(|err| format!("Container chain argument error: {}", err))?;

350

351

        // Change database path to make it depend on container chain para id

352

        // So instead of the usual "db/full" we have "db/full-container-2000"

353

        let mut db_path = container_chain_cli_config

354

            .database

355

            .path()

356

            .ok_or_else(|| "Failed to get database path".to_string())?

357

            .to_owned();

358

        db_path.set_file_name(format!("full-container-{}", container_chain_para_id));

359

        container_chain_cli_config.database.set_path(&db_path);

360

361

        // Return a closure because we may need to check if the db exists multiple times

362

        move || db_path.exists()

363

};

364

365

    // Start container chain node. After starting, check if the database is good or needs to

366

    // be removed. If the db needs to be removed, this function will handle the node restart, and

367

    // return the components of a running container chain node.

368

    // This should be a separate function, but it has so many arguments that I prefer to have it as a closure for now

369

    let start_node_impl_container_with_restart = || async move {

370

        // Loop will run at most 2 times: 1 time if the db is good and 2 times if the db needs to be removed

371

        for _ in 0..2 {

372

            let db_existed_before = check_db_exists();

373

374

            if let Some(sync) = override_sync_mode {

375

                container_chain_cli.base.base.network_params.sync = sync;

376

377

            log::info!(

378

                "Container chain sync mode: {:?}",

379

                container_chain_cli.base.base.network_params.sync

380

);

381

382

            let mut container_chain_cli_config = sc_cli::SubstrateCli::create_configuration(

383

                &container_chain_cli,

384

                &container_chain_cli,

385

                tokio_handle.clone(),

386

387

            .map_err(|err| format!("Container chain argument error: {}", err))?;

388

389

            // Change database path to make it depend on container chain para id

390

            // So instead of the usual "db/full" we have "db/full-container-2000"

391

            let mut db_path = container_chain_cli_config

392

                .database

393

                .path()

394

                .ok_or_else(|| "Failed to get database path".to_string())?

395

                .to_owned();

396

            db_path.set_file_name(format!("full-container-{}", container_chain_para_id));

397

            container_chain_cli_config.database.set_path(&db_path);

398

399

            let (container_chain_task_manager, container_chain_client, container_chain_db) =

400

                match container_chain_cli_config.network.network_backend {

401

                    sc_network::config::NetworkBackendType::Libp2p => {

402

                        start_node_impl_container::<_, _, sc_network::NetworkWorker<_, _>>(

403

                            container_chain_cli_config,

404

                            relay_chain_interface.clone(),

405

                            orchestrator_chain_interface.clone(),

406

                            sync_keystore.clone(),

407

                            container_chain_para_id,

408

                            collation_params.clone(),

409

                            generate_rpc_builder.clone(),

410

                            &container_chain_cli,

411

                            data_preserver,

412

413

                        .await?

414

415

                    sc_network::config::NetworkBackendType::Litep2p => {

416

                        start_node_impl_container::<_, _, sc_network::Litep2pNetworkBackend>(

417

                            container_chain_cli_config,

418

                            relay_chain_interface.clone(),

419

                            orchestrator_chain_interface.clone(),

420

                            sync_keystore.clone(),

421

                            container_chain_para_id,

422

                            collation_params.clone(),

423

                            generate_rpc_builder.clone(),

424

                            &container_chain_cli,

425

                            data_preserver,

426

427

                        .await?

428

429

};

430

431

            // Keep all node parts in one variable to make them easier to drop

432

            let node_parts = (

433

                container_chain_task_manager,

434

                container_chain_client,

435

                container_chain_db,

436

                db_path,

437

);

438

439

            if db_existed_before {

440

                // If the database already existed before, check if it can be used or it needs to be removed.

441

                // To remove the database, we restart the node, wait for the db to close to avoid a

442

                // "shutdown error" log, and then remove it.

443

                if let Some(db_removal_reason) = db_needs_removal(

444

                    &node_parts.1,

445

                    &orchestrator_chain_interface,

446

                    orchestrator_block_hash,

447

                    container_chain_para_id,

448

                    &container_chain_cli,

449

                    container_chain_cli.base.keep_db,

450

451

                .await?

452

453

                    let db_path = node_parts.3.clone();

454

                    // Important, drop `node_parts` before trying to `wait_for_paritydb_lock`

455

                    drop(node_parts);

456

                    // Wait here for the database created in the previous loop iteration to close.

457

                    // Dropping is not enough because there is some background process that keeps the database open,

458

                    // so we check the paritydb lock file directly.

459

                    log::info!(

460

                        "Restarting container chain {} after db deletion. Reason: {:?}",

461

                        container_chain_para_id,

462

                        db_removal_reason,

463

);

464

                    wait_for_paritydb_lock(&db_path, MAX_DB_RESTART_TIMEOUT)

465

                        .await

466

                        .map_err(|e| {

467

                            log::warn!(

468

                                "Error waiting for chain {} to release db lock: {:?}",

469

                                container_chain_para_id,

470

471

);

472

473

474

                        })?;

475

                    delete_container_chain_db(&db_path);

476

477

                    // Recursion, will only happen once because `db_existed_before` will be false after

478

                    // removing the db. Apparently closures cannot be recursive so fake recursion by

479

                    // using a loop + continue

480

                    continue;

481

482

483

484

            // If using full sync, print a warning if the local db is at block 0 and the chain has thousands of blocks

485

            if container_chain_cli.base.base.network_params.sync == SyncMode::Full {

486

                let last_container_block_temp = node_parts.1.chain_info().best_number;

487

                let cc_block_num = get_latest_container_block_number_from_orchestrator(

488

                    &orchestrator_chain_interface,

489

                    orchestrator_block_hash,

490

                    container_chain_para_id,

491

492

                .await

493

                .unwrap_or(0);

494

                if last_container_block_temp == 0 && cc_block_num > MAX_BLOCK_DIFF_FOR_FULL_SYNC {

495

                    let db_folder = format!("full-container-{}", container_chain_para_id);

496

                    log::error!("\

497

                        Existing database for container chain {} is at block 0, assuming that warp sync failed.\n\

498

                        The node will now use full sync, which has to download {} blocks.\n\

499

                        If running as collator, it may not finish syncing on time and miss block rewards.\n\

500

                        To force using warp sync, stop tanssi-node and manually remove the db folder: {:?}\n\

501

                        ", container_chain_para_id, cc_block_num, db_folder)

502

503

504

505

            return sc_service::error::Result::Ok(node_parts);

506

507

508

        unreachable!("Above loop can run at most 2 times, and in the second iteration it is guaranteed to return")

509

};

510

511

    let (mut container_chain_task_manager, container_chain_client, container_chain_db, db_path) =

512

        start_node_impl_container_with_restart().await?;

513

514

    // Signal that allows to gracefully stop a container chain

515

    let (signal, on_exit) = oneshot::channel::<bool>();

516

517

    let monitor_id;

518

519

        let mut state = state.lock().expect("poison error");

520

        let container_chain_client = container_chain_client as Arc<dyn Any + Sync + Send>;

521

522

        monitor_id = state.spawned_containers_monitor.push(SpawnedContainer {

523

            id: 0,

524

            para_id: container_chain_para_id,

525

            start_time: Instant::now(),

526

            stop_signal_time: None,

527

            stop_task_manager_time: None,

528

            stop_refcount_time: Default::default(),

529

            backend: Arc::downgrade(&container_chain_db),

530

            client: Arc::downgrade(&container_chain_client),

531

});

532

533

        if state

534

            .spawned_container_chains

535

            .contains_key(&container_chain_para_id)

536

537

            return Err(format!("Tried to spawn a container chain when another container chain with the same para id was already running: {:?}", container_chain_para_id).into());

538

539

        state.spawned_container_chains.insert(

540

            container_chain_para_id,

541

            ContainerChainState {

542

                stop_handle: StopContainerChain {

543

                    signal,

544

                    id: monitor_id,

545

},

546

                db_path: db_path.clone(),

547

},

548

);

549

550

551

    // Add the container chain task manager as a child task to the parent task manager.

552

    // We want to stop the node if this task manager stops, but we also want to allow a

553

    // graceful shutdown using the `on_exit` future.

554

    let name = "container-chain-task-manager";

555

    spawn_handle.spawn(name, None, async move {

556

        let mut container_chain_task_manager_future =

557

            container_chain_task_manager.future().fuse();

558

        let mut on_exit_future = on_exit.fuse();

559

560

        futures::select! {

561

            res1 = container_chain_task_manager_future => {

562

                // An essential task failed or the task manager was stopped unexpectedly

563

                // using `.terminate()`. This should stop the container chain but not the node.

564

                if res1.is_err() {

565

                    log::error!("Essential task failed in container chain {} task manager. Shutting down container chain service", container_chain_para_id);

566

                } else {

567

                    log::error!("Unexpected shutdown in container chain {} task manager. Shutting down container chain service", container_chain_para_id);

568

569

                // Mark this container chain as "failed to stop" to avoid warning in `self.stop()`

570

                let mut state = state.lock().expect("poison error");

571

                state.failed_para_ids.insert(container_chain_para_id);

572

                // Never delete db in this case because it is not a graceful shutdown

573

574

            stop_unassigned = on_exit_future => {

575

                // Graceful shutdown.

576

                // `stop_unassigned` will be `Ok(keep_db)` if `.stop()` has been called, which means that the

577

                // container chain has been unassigned, and will be `Err` if the handle has been dropped,

578

                // which means that the node is stopping.

579

                // Delete existing database if running as collator

580

                if validator && stop_unassigned == Ok(false) && !keep_db {

581

                    // If this breaks after a code change, make sure that all the variables that

582

                    // may keep the chain alive are dropped before the call to `wait_for_paritydb_lock`.

583

                    drop(container_chain_task_manager_future);

584

                    drop(container_chain_task_manager);

585

                    let db_closed = wait_for_paritydb_lock(&db_path, MAX_DB_RESTART_TIMEOUT)

586

                        .await

587

                        .map_err(|e| {

588

                            log::warn!(

589

                                "Error waiting for chain {} to release db lock: {:?}",

590

                                container_chain_para_id,

591

592

);

593

                        }).is_ok();

594

                    // If db has not closed in 60 seconds we do not delete it.

595

                    if db_closed {

596

                        delete_container_chain_db(&db_path);

597

598

599

600

601

602

        let mut state = state.lock().expect("poison error");

603

        state

604

            .spawned_containers_monitor

605

            .set_stop_task_manager_time(monitor_id, Instant::now());

606

});

607

608

    Ok(())

609

610

611

/// Interface for spawning and stopping container chain embedded nodes.

612

pub trait Spawner {

613

    /// Access to the Orchestrator Chain Interface

614

    fn orchestrator_chain_interface(&self) -> Arc<dyn OrchestratorChainInterface>;

615

616

    /// Try to start a new container chain. In case of an error, this does not stop the node, and

617

    /// the container chain will be attempted to spawn again when the collator is reassigned to it.

618

///

619

    /// It is possible that we try to spawn-stop-spawn the same chain, and the second spawn fails

620

    /// because the chain has not stopped yet, because `stop` does not wait for the chain to stop,

621

    /// so make sure to call `wait_for_paritydb_lock` before calling `spawn`, like we do in

622

    /// `handle_update_assignment`.

623

    fn spawn(

624

        &self,

625

        container_chain_para_id: ParaId,

626

        start_collation: bool,

627

    ) -> impl std::future::Future<Output = ()> + Send;

628

629

    /// Stop a container chain. Prints a warning if the container chain was not running.

630

    /// Returns the database path for the container chain, can be used with `wait_for_paritydb_lock`

631

    /// to ensure that the container chain has fully stopped. The database path can be `None` if the

632

    /// chain was not running.

633

    fn stop(&self, container_chain_para_id: ParaId, keep_db: bool) -> Option<PathBuf>;

634

635

636

impl<

637

        RuntimeApi: MinimalContainerRuntimeApi,

638

        TGenerateRpcBuilder: GenerateRpcBuilder<RuntimeApi>,

639

    > Spawner for ContainerChainSpawner<RuntimeApi, TGenerateRpcBuilder>

640

641

    /// Access to the Orchestrator Chain Interface

642

    fn orchestrator_chain_interface(&self) -> Arc<dyn OrchestratorChainInterface> {

643

        self.params.orchestrator_chain_interface.clone()

644

645

646

    /// Try to start a new container chain. In case of an error, this does not stop the node, and

647

    /// the container chain will be attempted to spawn again when the collator is reassigned to it.

648

///

649

    /// It is possible that we try to spawn-stop-spawn the same chain, and the second spawn fails

650

    /// because the chain has not stopped yet, because `stop` does not wait for the chain to stop,

651

    /// so before calling `spawn` make sure to call `wait_for_paritydb_lock` before, like we do in

652

    /// `handle_update_assignment`.

653

    async fn spawn(&self, container_chain_para_id: ParaId, start_collation: bool) {

654

        let try_spawn_params = self.params.clone();

655

        let state = self.state.clone();

656

        let state2 = state.clone();

657

658

        match try_spawn(

659

            try_spawn_params,

660

            state,

661

            container_chain_para_id,

662

            start_collation,

663

664

        .await

665

666

            Ok(()) => {}

667

            Err(e) => {

668

                log::error!(

669

                    "Failed to start container chain {}: {}",

670

                    container_chain_para_id,

671

672

);

673

                // Mark this container chain as "failed to start"

674

                let mut state = state2.lock().expect("poison error");

675

                state.failed_para_ids.insert(container_chain_para_id);

676

677

678

679

680

    /// Stop a container chain. Prints a warning if the container chain was not running.

681

    /// Returns the database path for the container chain, can be used with `wait_for_paritydb_lock`

682

    /// to ensure that the container chain has fully stopped. The database path can be `None` if the

683

    /// chain was not running.

684

    fn stop(&self, container_chain_para_id: ParaId, keep_db: bool) -> Option<PathBuf> {

685

        let mut state = self.state.lock().expect("poison error");

686

        let stop_handle = state

687

            .spawned_container_chains

688

            .remove(&container_chain_para_id);

689

690

        match stop_handle {

691

            Some(stop_handle) => {

692

                log::info!("Stopping container chain {}", container_chain_para_id);

693

694

                let id = stop_handle.stop_handle.id;

695

                state

696

                    .spawned_containers_monitor

697

                    .set_stop_signal_time(id, Instant::now());

698

699

                // Send signal to perform graceful shutdown, which will delete the db if needed

700

                let _ = stop_handle.stop_handle.signal.send(keep_db);

701

702

                Some(stop_handle.db_path)

703

704

            None => {

705

                // Do not print the warning message if this is a container chain that has failed to

706

                // start, because in that case it will not be running

707

                if !state.failed_para_ids.remove(&container_chain_para_id) {

708

                    log::warn!(

709

                        "Tried to stop a container chain that is not running: {}",

710

                        container_chain_para_id

711

);

712

713

714

                None

715

716

717

718

719

720

impl<

721

        RuntimeApi: MinimalContainerRuntimeApi,

722

        TGenerateRpcBuilder: GenerateRpcBuilder<RuntimeApi>,

723

    > ContainerChainSpawner<RuntimeApi, TGenerateRpcBuilder>

724

725

    /// Receive and process `CcSpawnMsg`s indefinitely

726

    pub async fn rx_loop(

727

        mut self,

728

        mut rx: mpsc::UnboundedReceiver<CcSpawnMsg>,

729

        validator: bool,

730

        solochain: bool,

731

) {

732

        let orchestrator_para_id = self

733

            .params

734

            .collation_params

735

            .as_ref()

736

            .expect("assignment update should only occur in a collating node")

737

            .orchestrator_para_id;

738

739

        // The node always starts as an orchestrator chain collator.

740

        // This is because the assignment is detected after importing a new block, so if all

741

        // collators stop at the same time, when they start again nobody will produce the new block.

742

        // So all nodes start as orchestrator chain collators, until the first block is imported,

743

        // then the real assignment is used.

744

        // Except in solochain mode, then the initial assignment is None.

745

        if validator && !solochain {

746

            self.handle_update_assignment(Some(orchestrator_para_id), None, true)

747

                .await;

748

749

750

        while let Some(msg) = rx.recv().await {

751

            match msg {

752

                CcSpawnMsg::UpdateAssignment { current, next } => {

753

                    self.handle_update_assignment(current, next, false).await;

754

755

756

757

758

        // The while loop can end if all the senders get dropped, but since this is an

759

        // essential task we don't want it to stop. So await a future that never completes.

760

        // This should only happen when starting a full node.

761

        if !validator {

762

            let () = std::future::pending().await;

763

764

765

766

    /// Handle `CcSpawnMsg::UpdateAssignment`

767

    async fn handle_update_assignment(

768

        &mut self,

769

        current: Option<ParaId>,

770

        next: Option<ParaId>,

771

        disable_db_folder_cleanup: bool,

772

) {

773

        if !disable_db_folder_cleanup && !self.db_folder_cleanup_done {

774

            self.db_folder_cleanup_done = true;

775

776

            // Disabled when running with --keep-db

777

            let keep_db = self.params.container_chain_cli.base.keep_db;

778

            if !keep_db {

779

                let mut chains_to_keep = HashSet::new();

780

                chains_to_keep.extend(current);

781

                chains_to_keep.extend(next);

782

                self.db_folder_cleanup(&chains_to_keep);

783

784

785

786

        let orchestrator_para_id = self

787

            .params

788

            .collation_params

789

            .as_ref()

790

            .expect("assignment update should only occur in a collating node")

791

            .orchestrator_para_id;

792

793

        let HandleUpdateAssignmentResult {

794

            chains_to_stop,

795

            chains_to_start,

796

            need_to_restart: _,

797

        } = handle_update_assignment_state_change(

798

            &mut self.state.lock().expect("poison error"),

799

            orchestrator_para_id,

800

            current,

801

            next,

802

);

803

804

        if current != Some(orchestrator_para_id) {

805

            // If not assigned to orchestrator chain anymore, we need to stop the collator process

806

            let maybe_exit_notification_receiver = self

807

                .collation_cancellation_constructs

808

                .take()

809

                .map(|(cancellation_token, exit_notification_receiver)| {

810

                    cancellation_token.cancel();

811

                    exit_notification_receiver

812

});

813

814

            if let Some(exit_notification_receiver) = maybe_exit_notification_receiver {

815

                let _ = exit_notification_receiver.await;

816

817

        } else if self.collation_cancellation_constructs.is_none() {

818

            // If assigned to orchestrator chain but the collator process is not running, start it

819

            self.collation_cancellation_constructs = Some((self.collate_on_tanssi)());

820

821

822

        // Stop all container chains that are no longer needed

823

        let mut db_paths_restart = vec![];

824

        for para_id in chains_to_stop {

825

            // Keep db if we are currently assigned to this chain

826

            let keep_db = Some(para_id) == current;

827

            let maybe_db_path = self.stop(para_id, keep_db);

828

            // If we are restarting this chain, save its db_path to check when it actually stopped

829

            if let Some(db_path) = maybe_db_path {

830

                if chains_to_start.contains(&para_id) {

831

                    db_paths_restart.push((para_id, db_path));

832

833

834

835

836

        if !db_paths_restart.is_empty() {

837

            // Ensure the chains we stopped actually stopped by checking if their database is unlocked.

838

            // Using `join_all` because in one edge case we may be restarting 2 chains,

839

            // but almost always this will be only one future.

840

            let futs = db_paths_restart

841

                .into_iter()

842

                .map(|(para_id, db_path)| async move {

843

                    wait_for_paritydb_lock(&db_path, MAX_DB_RESTART_TIMEOUT)

844

                        .await

845

                        .map_err(|e| {

846

                            log::warn!(

847

                                "Error waiting for chain {} to release db lock: {:?}",

848

                                para_id,

849

850

);

851

})

852

});

853

            futures::future::join_all(futs).await;

854

855

856

        // Start all new container chains (usually 1)

857

        for para_id in chains_to_start {

858

            // Edge case: when starting the node it may be assigned to a container chain, so we need to

859

            // start a container chain already collating.

860

            // TODO: another edge case: if current == None, and running_chains == 0,

861

            // and chains_to_start == 1, we can start this chain as collating, and we won't need

862

            // to restart it on the next session. We need to add some extra state somewhere to

863

            // implement this properly.

864

            let start_collation = Some(para_id) == current;

865

            self.spawn(para_id, start_collation).await;

866

867

868

869

    fn db_folder_cleanup(&self, chains_to_keep: &HashSet<ParaId>) {

870

        // "containers" folder

871

        let mut base_path = self

872

            .params

873

            .container_chain_cli

874

            .base

875

            .base

876

            .shared_params

877

            .base_path

878

            .as_ref()

879

            .expect("base_path is always set")

880

            .to_owned();

881

882

        // "containers/chains"

883

        base_path.push("chains");

884

885

        // Inside chains folder we have container folders such as

886

        // containers/chains/simple_container_2000/

887

        // containers/chains/frontier_container_2001/

888

        // But this is not the para id, it's the chain id which we have set to include the para id, but that's not mandatory.

889

        // To get the para id we need to look for the paritydb folder:

890

        // containers/chains/frontier_container_2001/paritydb/full-container-2001/

891

        let mut chain_folders = sort_container_folders_by_para_id(&base_path);

892

893

        // Keep chains that we are assigned to

894

        for para_id in chains_to_keep {

895

            chain_folders.remove(&Some(*para_id));

896

897

898

        // Print nice log message when removing folders

899

        if !chain_folders.is_empty() {

900

            let chain_folders_fmt = chain_folders

901

                .iter()

902

                .flat_map(|(para_id, vec_paths)| {

903

                    let para_id_fmt = if let Some(para_id) = para_id {

904

                        para_id.to_string()

905

                    } else {

906

                        "None".to_string()

907

};

908

                    vec_paths

909

                        .iter()

910

                        .map(move |path| format!("\n{}: {}", para_id_fmt, path.display()))

911

})

912

                .collect::<String>();

913

            log::info!(

914

                "db_folder_cleanup: removing container folders: (para_id, path):{}",

915

                chain_folders_fmt

916

);

917

918

919

        // Remove, ignoring errors

920

        for (_para_id, folders) in chain_folders {

921

            for folder in folders {

922

                let _ = std::fs::remove_dir_all(&folder);

923

924

925

926

927

928

struct HandleUpdateAssignmentResult {

929

    chains_to_stop: Vec<ParaId>,

930

    chains_to_start: Vec<ParaId>,

931

    #[allow(dead_code)] // no longer used except in tests

932

    need_to_restart: bool,

933

934

935

// This is a separate function to allow testing

936

35

fn handle_update_assignment_state_change(

937

35

    state: &mut ContainerChainSpawnerState,

938

35

    orchestrator_para_id: ParaId,

939

35

    current: Option<ParaId>,

940

35

    next: Option<ParaId>,

941

35

) -> HandleUpdateAssignmentResult {

942

35

    if (state.assigned_para_id, state.next_assigned_para_id) == (current, next) {

943

        // If nothing changed there is nothing to update

944

        return HandleUpdateAssignmentResult {

945

            chains_to_stop: Default::default(),

946

            chains_to_start: Default::default(),

947

            need_to_restart: false,

948

};

949

35

950

951

    // Create a set with the container chains that were running before, and the container

952

    // chains that should be running after the updated assignment. This is used to calculate

953

    // the difference, and stop and start the required container chains.

954

35

    let mut running_chains_before = HashSet::new();

955

35

    let mut running_chains_after = HashSet::new();

956

957

35

    running_chains_before.extend(state.assigned_para_id);

958

35

    running_chains_before.extend(state.next_assigned_para_id);

959

    // Ignore orchestrator_para_id because it is handled in a special way, as it does not need to

960

    // start one session before in order to sync.

961

35

    running_chains_before.remove(&orchestrator_para_id);

962

963

35

    running_chains_after.extend(current);

964

35

    running_chains_after.extend(next);

965

35

    running_chains_after.remove(&orchestrator_para_id);

966

35

    let mut need_to_restart_current = false;

967

35

    let mut need_to_restart_next = false;

968

969

35

    if state.assigned_para_id != current {

970

24

        if let Some(para_id) = current {

971

            // If the assigned container chain has changed, we may need to

972

            // restart it in collation mode, unless it is the orchestrator chain.

973

16

            if para_id != orchestrator_para_id {

974

13

                need_to_restart_current = true;

975

13

976

8

977

978

24

        if let Some(para_id) = state.assigned_para_id {

979

18

            if para_id != orchestrator_para_id && Some(para_id) == next {

980

2

                need_to_restart_next = true;

981

16

982

6

983

11

984

985

35

    state.assigned_para_id = current;

986

35

    state.next_assigned_para_id = next;

987

988

35

    let mut chains_to_stop: Vec<_> = running_chains_before

989

35

        .difference(&running_chains_after)

990

35

        .copied()

991

35

        .collect();

992

35

    let mut chains_to_start: Vec<_> = running_chains_after

993

35

        .difference(&running_chains_before)

994

35

        .copied()

995

35

        .collect();

996

997

35

    if need_to_restart_current {

998

        // Force restart of new assigned container chain: if it was running before it was in "syncing mode",

999

        // which doesn't use the correct ports, so start it in "collation mode".

1000

13

        let id = current.unwrap();

1001

13

        if running_chains_before.contains(&id) && !chains_to_stop.contains(&id) {

1002

6

            chains_to_stop.push(id);

1003

7

1004

13

        if !chains_to_start.contains(&id) {

1005

6

            chains_to_start.push(id);

1006

7

1007

22

1008

1009

35

    if need_to_restart_next {

1010

        // Handle edge case of going from (2000, 2001) to (2001, 2000). In that case we must restart both chains,

1011

        // because previously 2000 was collating and now 2000 will only be syncing.

1012

2

        let id = next.unwrap();

1013

2

        if running_chains_before.contains(&id) && !chains_to_stop.contains(&id) {

1014

2

            chains_to_stop.push(id);

1015

2

1016

2

        if !chains_to_start.contains(&id) {

1017

2

            chains_to_start.push(id);

1018

2

1019

33

1020

1021

    HandleUpdateAssignmentResult {

1022

35

        chains_to_stop,

1023

35

        chains_to_start,

1024

35

        need_to_restart: need_to_restart_current || need_to_restart_next,

1025

1026

35

1027

1028

async fn get_latest_container_block_number_from_orchestrator(

1029

    orchestrator_chain_interface: &Arc<dyn OrchestratorChainInterface>,

1030

    orchestrator_block_hash: PHash,

1031

    container_chain_para_id: ParaId,

1032

) -> Option<u32> {

1033

    // Get the container chain's latest block from orchestrator chain and compare with client's one

1034

    orchestrator_chain_interface

1035

        .latest_block_number(orchestrator_block_hash, container_chain_para_id)

1036

        .await

1037

        .unwrap_or_default()

1038

1039

1040

#[derive(Debug)]

1041

#[allow(dead_code)]

1042

enum DbRemovalReason {

1043

    HighBlockDiff {

1044

        best_block_number_db: u32,

1045

        best_block_number_onchain: u32,

1046

},

1047

    GenesisHashMismatch {

1048

        container_client_genesis_hash: H256,

1049

        chain_spec_genesis_hash_v0: H256,

1050

        chain_spec_genesis_hash_v1: H256,

1051

},

1052

1053

1054

/// Given a container chain client, check if the database is valid. If not, returns `Some` with the

1055

/// reason for db removal.

1056

/// Reasons may be:

1057

/// * High block diff: when the local db is outdated and it would take a long time to sync using full sync, we remove it to be able to use warp sync.

1058

/// * Genesis hash mismatch, when the chain was deregistered and a different chain with the same para id was registered.

1059

async fn db_needs_removal<RuntimeApi: MinimalContainerRuntimeApi>(

1060

    container_chain_client: &Arc<ContainerChainClient<RuntimeApi>>,

1061

    orchestrator_chain_interface: &Arc<dyn OrchestratorChainInterface>,

1062

    orchestrator_block_hash: PHash,

1063

    container_chain_para_id: ParaId,

1064

    container_chain_cli: &ContainerChainCli,

1065

    keep_db: bool,

1066

) -> sc_service::error::Result<Option<DbRemovalReason>> {

1067

    // Check block diff, only needed if keep-db is false

1068

    if !keep_db {

1069

        // Get latest block number from the container chain client

1070

        let last_container_block_temp = container_chain_client.chain_info().best_number;

1071

        if last_container_block_temp == 0 {

1072

            // Don't remove an empty database, as it may be in the process of a warp sync

1073

        } else if get_latest_container_block_number_from_orchestrator(

1074

            orchestrator_chain_interface,

1075

            orchestrator_block_hash,

1076

            container_chain_para_id,

1077

1078

        .await

1079

        .unwrap_or(0)

1080

        .abs_diff(last_container_block_temp)

1081

            > MAX_BLOCK_DIFF_FOR_FULL_SYNC

1082

1083

            // if the diff is big, delete db and restart using warp sync

1084

            return Ok(Some(DbRemovalReason::HighBlockDiff {

1085

                best_block_number_db: last_container_block_temp,

1086

                best_block_number_onchain: last_container_block_temp,

1087

            }));

1088

1089

1090

1091

    // Generate genesis hash to compare against container client's genesis hash

1092

    let container_preloaded_genesis = container_chain_cli.preloaded_chain_spec.as_ref().unwrap();

1093

1094

    // Check with both state versions, but first v1 which is the latest

1095

    let block_v1: Block =

1096

        generate_genesis_block(&**container_preloaded_genesis, sp_runtime::StateVersion::V1)

1097

            .map_err(|e| format!("{:?}", e))?;

1098

    let chain_spec_genesis_hash_v1 = block_v1.header().hash();

1099

1100

    let container_client_genesis_hash = container_chain_client.chain_info().genesis_hash;

1101

1102

    if container_client_genesis_hash != chain_spec_genesis_hash_v1 {

1103

        let block_v0: Block =

1104

            generate_genesis_block(&**container_preloaded_genesis, sp_runtime::StateVersion::V0)

1105

                .map_err(|e| format!("{:?}", e))?;

1106

        let chain_spec_genesis_hash_v0 = block_v0.header().hash();

1107

1108

        if container_client_genesis_hash != chain_spec_genesis_hash_v0 {

1109

            log::info!("Container genesis V0: {:?}", chain_spec_genesis_hash_v0);

1110

            log::info!("Container genesis V1: {:?}", chain_spec_genesis_hash_v1);

1111

            log::info!(

1112

                "Chain spec genesis {:?} did not match with any container genesis - Restarting...",

1113

                container_client_genesis_hash

1114

);

1115

            return Ok(Some(DbRemovalReason::GenesisHashMismatch {

1116

                container_client_genesis_hash,

1117

                chain_spec_genesis_hash_v0,

1118

                chain_spec_genesis_hash_v1,

1119

            }));

1120

1121

1122

1123

    Ok(None)

1124

1125

1126

/// Remove the container chain database folder. This is called with db_path:

1127

///     `Collator2002-01/data/containers/chains/simple_container_2002/paritydb/full-container-2002`

1128

/// but we want to delete everything under

1129

///     `Collator2002-01/data/containers/chains/simple_container_2002`

1130

/// So we use `delete_empty_folders_recursive` to try to remove the parent folders as well, but only

1131

/// if they are empty. This is to avoid removing any secret keys or other important data.

1132

fn delete_container_chain_db(db_path: &Path) {

1133

    // Remove folder `full-container-2002`

1134

    let _ = std::fs::remove_dir_all(db_path);

1135

    // Remove all the empty folders inside `simple_container_2002`, including self

1136

    if let Some(parent) = db_path.ancestors().nth(2) {

1137

        delete_empty_folders_recursive(parent);

1138

1139

1140

1141

/// Depth-first removal of every empty folder below `path`, followed by `path` itself if it ended
/// up empty. Folders that still contain files are left in place.
/// Any IO error is ignored.
fn delete_empty_folders_recursive(path: &Path) {
    // If `path` cannot be listed there is nothing to do
    let Ok(entries) = std::fs::read_dir(path) else {
        return;
    };

    // Unreadable entries are skipped; every readable subfolder is pruned first
    entries
        .flatten()
        .map(|entry| entry.path())
        .filter(|child| child.is_dir())
        .for_each(|child| delete_empty_folders_recursive(&child));

    // `remove_dir` returns an error on a non-empty directory, which we ignore
    let _ = std::fs::remove_dir(path);
}

1165

1166

/// Parse a list of boot nodes in `Vec<u8>` format. Invalid boot nodes are filtered out.

1167

3

fn parse_boot_nodes_ignore_invalid(

1168

3

    boot_nodes_raw: Vec<Vec<u8>>,

1169

3

    container_chain_para_id: ParaId,

1170

3

) -> Vec<MultiaddrWithPeerId> {

1171

3

    boot_nodes_raw

1172

3

        .into_iter()

1173

3

        .filter_map(|x| {

1174

3

            let x = String::from_utf8(x)

1175

3

                .map_err(|e| {

1176

1

                    log::debug!(

1177

                        "Invalid boot node in container chain {}: {}",

1178

                        container_chain_para_id,

1179

1180

);

1181

1

})

1182

3

                .ok()?;

1183

1184

2

            x.parse::<MultiaddrWithPeerId>()

1185

2

                .map_err(|e| {

1186

1

                    log::debug!(

1187

                        "Invalid boot node in container chain {}: {}",

1188

                        container_chain_para_id,

1189

1190

1191

1

})

1192

2

                .ok()

1193

3

})

1194

3

        .collect()

1195

3

1196

1197

pub async fn wait_for_paritydb_lock(db_path: &Path, max_timeout: Duration) -> Result<(), String> {

1198

    let now = Instant::now();

1199

1200

    while now.elapsed() < max_timeout {

1201

        let lock_held = check_paritydb_lock_held(db_path)

1202

            .map_err(|e| format!("Failed to check if lock file is held: {}", e))?;

1203

        if !lock_held {

1204

            return Ok(());

1205

1206

        sleep(Duration::from_secs(1)).await;

1207

1208

1209

    Err("Timeout when waiting for paritydb lock".to_string())

1210

1211

1212

/// Given a path to a paritydb database, check if its lock file is held. This indicates that a

1213

/// background process is still using the database, so we should wait before trying to open it.

1214

///

1215

/// This should be kept up to date with the way paritydb handles the lock file:

1216

/// <https://github.com/paritytech/parity-db/blob/2b6820e310a08678d4540c044f41a93d87343ac8/src/db.rs#L215>

1217

fn check_paritydb_lock_held(db_path: &Path) -> Result<bool, std::io::Error> {

1218

    if !db_path.is_dir() {

1219

        // Lock file does not exist, so it is not held

1220

        return Ok(false);

1221

1222

1223

    let mut lock_path: std::path::PathBuf = db_path.to_owned();

1224

    lock_path.push("lock");

1225

    let lock_file = std::fs::OpenOptions::new()

1226

        .create(true)

1227

        .read(true)

1228

        .write(true)

1229

        .truncate(true)

1230

        .open(lock_path.as_path())?;

1231

    // Check if the lock file is busy by trying to lock it.

1232

    // Returns err if failed to adquire the lock.

1233

    let lock_held = lock_file.try_lock_exclusive().is_err();

1234

1235

    Ok(lock_held)

1236

1237

1238

fn sort_container_folders_by_para_id(

1239

    chains_folder_path: &Path,

1240

) -> HashMap<Option<ParaId>, Vec<PathBuf>> {

1241

    let mut h = HashMap::new();

1242

1243

    let entry_iter = std::fs::read_dir(chains_folder_path);

1244

    let entry_iter = match entry_iter {

1245

        Ok(x) => x,

1246

        Err(_e) => return h,

1247

};

1248

1249

    for entry in entry_iter {

1250

        let entry = match entry {

1251

            Ok(x) => x,

1252

            Err(_e) => continue,

1253

};

1254

1255

        let path = entry.path();

1256

        if path.is_dir() {

1257

            if let Ok(para_id) = process_container_folder_get_para_id(path.clone()) {

1258

                h.entry(para_id).or_default().push(path);

1259

1260

1261

1262

1263

1264

1265

1266

fn process_container_folder_get_para_id(path: PathBuf) -> std::io::Result<Option<ParaId>> {

1267

    // Build the path to the paritydb directory

1268

    let paritydb_path = path.join("paritydb");

1269

1270

    // Check if the paritydb directory exists and is a directory

1271

    if !paritydb_path.is_dir() {

1272

        // If not, associate the path with `None` in the hashmap

1273

        return Ok(None);

1274

1275

1276

    // Read the entries in the paritydb directory

1277

    let entry_iter = std::fs::read_dir(&paritydb_path)?;

1278

1279

    let mut para_id: Option<ParaId> = None;

1280

1281

    // Iterate over each entry in the paritydb directory

1282

    for entry in entry_iter {

1283

        let entry = entry?;

1284

        let sub_path = entry.path();

1285

1286

        // Only consider directories

1287

        if !sub_path.is_dir() {

1288

            continue;

1289

1290

1291

        let sub_path_file_name = match sub_path.file_name().and_then(|s| s.to_str()) {

1292

            Some(x) => x,

1293

            None => {

1294

                continue;

1295

1296

};

1297

1298

        // That follow this pattern

1299

        if !sub_path_file_name.starts_with("full-container-") {

1300

            continue;

1301

1302

1303

        if let Some(id) = parse_para_id_from_folder_name(sub_path_file_name) {

1304

            if para_id.is_some() {

1305

                // If there is more than one folder with a para id, assume this folder is

1306

                // corrupted and ignore it, keep it for manual deletion

1307

                return Err(std::io::Error::new(std::io::ErrorKind::Other, ""));

1308

1309

            para_id = Some(id);

1310

1311

1312

1313

    Ok(para_id)

1314

1315

1316

// Input:

1317

// full-container-2000

1318

// Output:

1319

// Some(2000)

1320

5

fn parse_para_id_from_folder_name(folder_name: &str) -> Option<ParaId> {

1321

    // Find last '-' in string

1322

5

    let idx = folder_name.rfind('-')?;

1323

    // +1 to skip the '-'

1324

3

    let id_str = &folder_name[idx + 1..];

1325

    // Try to parse as u32, in case of error return None

1326

3

    let id = id_str.parse::<u32>().ok()?;

1327

1328

1

    Some(id.into())

1329

5

1330

1331

#[cfg(test)]

1332

mod tests {

1333

    use {super::*, std::path::PathBuf};

1334

1335

    // Copy of ContainerChainSpawner with extra assertions for tests, and mocked spawn function.

1336

    struct MockContainerChainSpawner {

1337

        state: Arc<Mutex<ContainerChainSpawnerState>>,

1338

        orchestrator_para_id: ParaId,

1339

        collate_on_tanssi: Arc<

1340

            dyn Fn() -> (CancellationToken, futures::channel::oneshot::Receiver<()>) + Send + Sync,

1341

>,

1342

        collation_cancellation_constructs: Option<()>,

1343

        // Keep track of the last CollateOn message, for tests

1344

        currently_collating_on: Arc<Mutex<Option<ParaId>>>,

1345

1346

1347

    impl MockContainerChainSpawner {

1348

10

        fn new() -> Self {

1349

10

            let orchestrator_para_id = 1000.into();

1350

            // The node always starts as an orchestrator chain collator

1351

10

            let currently_collating_on = Arc::new(Mutex::new(Some(orchestrator_para_id)));

1352

10

            let currently_collating_on2 = currently_collating_on.clone();

1353

10

            let collate_closure = move || {

1354

3

                let mut cco = currently_collating_on2.lock().unwrap();

1355

3

                assert_ne!(

1356

3

                    *cco,

1357

3

                    Some(orchestrator_para_id),

1358

                    "Received CollateOn message when we were already collating on this chain: {}",

1359

                    orchestrator_para_id

1360

);

1361

3

                *cco = Some(orchestrator_para_id);

1362

3

                let (_, receiver) = futures::channel::oneshot::channel();

1363

3

                (CancellationToken::new(), receiver)

1364

3

};

1365

10

            let collate_on_tanssi: Arc<

1366

10

                dyn Fn() -> (CancellationToken, futures::channel::oneshot::Receiver<()>)

1367

10

                    + Send

1368

10

                    + Sync,

1369

10

            > = Arc::new(collate_closure);

1370

1371

10

            Self {

1372

10

                state: Arc::new(Mutex::new(ContainerChainSpawnerState {

1373

10

                    spawned_container_chains: Default::default(),

1374

10

                    assigned_para_id: Some(orchestrator_para_id),

1375

10

                    next_assigned_para_id: None,

1376

10

                    failed_para_ids: Default::default(),

1377

10

                    spawned_containers_monitor: Default::default(),

1378

10

                })),

1379

10

                orchestrator_para_id,

1380

10

                collate_on_tanssi,

1381

10

                // Some if collator starts on orchestrator chain

1382

10

                collation_cancellation_constructs: Some(()),

1383

10

                currently_collating_on,

1384

10

1385

10

1386

1387

21

        fn spawn(&self, container_chain_para_id: ParaId, start_collation: bool) {

1388

21

            let (signal, _on_exit) = oneshot::channel();

1389

21

            let currently_collating_on2 = self.currently_collating_on.clone();

1390

21

            let collate_closure = move || {

1391

13

                let mut cco = currently_collating_on2.lock().unwrap();

1392

13

                assert_ne!(

1393

13

                    *cco,

1394

13

                    Some(container_chain_para_id),

1395

                    "Received CollateOn message when we were already collating on this chain: {}",

1396

                    container_chain_para_id

1397

);

1398

13

                *cco = Some(container_chain_para_id);

1399

13

                let (_, receiver) = futures::channel::oneshot::channel();

1400

13

                (CancellationToken::new(), receiver)

1401

13

};

1402

21

            let collate_on: Arc<

1403

21

                dyn Fn() -> (CancellationToken, futures::channel::oneshot::Receiver<()>)

1404

21

                    + Send

1405

21

                    + Sync,

1406

21

            > = Arc::new(collate_closure);

1407

            // Dummy db_path for tests, is not actually used

1408

21

            let db_path = PathBuf::from(format!("/tmp/container-{}/db", container_chain_para_id));

1409

1410

21

            let old = self

1411

21

                .state

1412

21

                .lock()

1413

21

                .expect("poison error")

1414

21

                .spawned_container_chains

1415

21

                .insert(

1416

21

                    container_chain_para_id,

1417

21

                    ContainerChainState {

1418

21

                        stop_handle: StopContainerChain { signal, id: 0 },

1419

21

                        db_path,

1420

21

},

1421

);

1422

1423

21

            assert!(

1424

21

                old.is_none(),

1425

                "tried to spawn a container chain that was already running: {}",

1426

                container_chain_para_id

1427

);

1428

1429

21

            if start_collation {

1430

13

                let (_cancellation_token, _exit_receiver) = collate_on();

1431

13

1432

21

1433

1434

15

        fn stop(&self, container_chain_para_id: ParaId) {

1435

15

            let stop_handle = self

1436

15

                .state

1437

15

                .lock()

1438

15

                .expect("poison error")

1439

15

                .spawned_container_chains

1440

15

                .remove(&container_chain_para_id);

1441

1442

15

            match stop_handle {

1443

15

                Some(_stop_handle) => {

1444

15

                    log::info!("Stopping container chain {}", container_chain_para_id);

1445

1446

                None => {

1447

                    panic!(

1448

                        "Tried to stop a container chain that is not running: {}",

1449

                        container_chain_para_id

1450

);

1451

1452

1453

1454

            // Update currently_collating_on, if we stopped the chain we are no longer collating there

1455

15

            let mut lco = self.currently_collating_on.lock().unwrap();

1456

15

            if *lco == Some(container_chain_para_id) {

1457

7

                *lco = None;

1458

8

1459

15

1460

1461

35

        fn handle_update_assignment(&mut self, current: Option<ParaId>, next: Option<ParaId>) {

1462

            let HandleUpdateAssignmentResult {

1463

35

                chains_to_stop,

1464

35

                chains_to_start,

1465

35

                need_to_restart,

1466

35

            } = handle_update_assignment_state_change(

1467

35

                &mut self.state.lock().unwrap(),

1468

35

                self.orchestrator_para_id,

1469

35

                current,

1470

35

                next,

1471

35

);

1472

1473

35

            if current != Some(self.orchestrator_para_id) {

1474

                // If not assigned to orchestrator chain anymore, we need to stop the collator process

1475

27

                let mut cco = self.currently_collating_on.lock().unwrap();

1476

27

                if *cco == Some(self.orchestrator_para_id) {

1477

10

                    *cco = None;

1478

17

1479

27

                self.collation_cancellation_constructs = None;

1480

8

            } else if self.collation_cancellation_constructs.is_none() {

1481

3

                let (_cancellation_token, _exit_notification_receiver) = (self.collate_on_tanssi)();

1482

3

                self.collation_cancellation_constructs = Some(());

1483

5

1484

1485

            // Assert we never start and stop the same container chain

1486

56

            for para_id in &chains_to_start {

1487

21

                if !need_to_restart {

1488

4

                    assert!(

1489

4

                        !chains_to_stop.contains(para_id),

1490

                        "Tried to start and stop same container chain: {}",

1491

                        para_id

1492

);

1493

                } else {

1494

                    // Will try to start and stop container chain with id "current" or "next", so ignore that

1495

17

                    if Some(*para_id) != current && Some(*para_id) != next {

1496

                        assert!(

1497

                            !chains_to_stop.contains(para_id),

1498

                            "Tried to start and stop same container chain: {}",

1499

                            para_id

1500

);

1501

17

1502

1503

1504

            // Assert we never start or stop the orchestrator chain

1505

35

            assert!(!chains_to_start.contains(&self.orchestrator_para_id));

1506

35

            assert!(!chains_to_stop.contains(&self.orchestrator_para_id));

1507

1508

            // Stop all container chains that are no longer needed

1509

50

            for para_id in chains_to_stop {

1510

15

                self.stop(para_id);

1511

15

1512

1513

            // Start all new container chains (usually 1)

1514

56

            for para_id in chains_to_start {

1515

21

                // Edge case: when starting the node it may be assigned to a container chain, so we need to

1516

21

                // start a container chain already collating.

1517

21

                let start_collation = Some(para_id) == current;

1518

21

                self.spawn(para_id, start_collation);

1519

21

1520

1521

            // Assert that if we are currently assigned to a container chain, we are collating there

1522

35

            if let Some(para_id) = current {

1523

24

                self.assert_collating_on(Some(para_id));

1524

24

            } else {

1525

11

                self.assert_collating_on(None);

1526

11

1527

35

1528

1529

        #[track_caller]

1530

71

        fn assert_collating_on(&self, para_id: Option<ParaId>) {

1531

71

            let currently_collating_on = *self.currently_collating_on.lock().unwrap();

1532

71

            assert_eq!(currently_collating_on, para_id);

1533

71

1534

1535

        #[track_caller]

1536

36

        fn assert_running_chains(&self, para_ids: &[ParaId]) {

1537

36

            let mut actually_running: Vec<ParaId> = self

1538

36

                .state

1539

36

                .lock()

1540

36

                .unwrap()

1541

36

                .spawned_container_chains

1542

36

                .keys()

1543

36

                .cloned()

1544

36

                .collect();

1545

36

            actually_running.sort();

1546

36

            let mut should_be_running = para_ids.to_vec();

1547

36

            should_be_running.sort();

1548

36

            assert_eq!(actually_running, should_be_running);

1549

36

1550

1551

1552

    #[test]
    fn starts_collating_on_tanssi() {
        let orchestrator = ParaId::from(1000);
        let mut spawner = MockContainerChainSpawner::new();

        // A fresh mock collates on the orchestrator chain and runs no container chains
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);

        // Without any assignment it stops collating
        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);
    }

1562

1563

    #[test]
    fn assigned_to_orchestrator_chain() {
        let orchestrator = ParaId::from(1000);
        let mut spawner = MockContainerChainSpawner::new();

        // Orchestrator as current and next assignment
        spawner.handle_update_assignment(Some(orchestrator), Some(orchestrator));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);

        // Orchestrator as current assignment only
        spawner.handle_update_assignment(Some(orchestrator), None);
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);

        // No assignment at all
        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        // Orchestrator as next assignment only: not collating yet, nothing spawned
        spawner.handle_update_assignment(None, Some(orchestrator));
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        // Back to orchestrator as current and next assignment
        spawner.handle_update_assignment(Some(orchestrator), Some(orchestrator));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);
    }

1587

1588

    #[test]
    fn assigned_to_container_chain() {
        let container = ParaId::from(2000);
        let mut spawner = MockContainerChainSpawner::new();

        // Container chain as current and next assignment
        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        // Container chain as current assignment only
        spawner.handle_update_assignment(Some(container), None);
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        // No assignment at all: the container chain is stopped
        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        // Container chain as next assignment only: running, but not collating
        spawner.handle_update_assignment(None, Some(container));
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[container]);

        // Back to container chain as current and next assignment
        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);
    }

1612

1613

    #[test]
    fn spawn_container_chains() {
        let orchestrator = ParaId::from(1000);
        let first = ParaId::from(2000);
        let second = ParaId::from(2001);
        let mut spawner = MockContainerChainSpawner::new();

        // The next assignment is spawned while still collating on the orchestrator
        spawner.handle_update_assignment(Some(orchestrator), Some(first));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[first]);

        spawner.handle_update_assignment(Some(first), Some(first));
        spawner.assert_collating_on(Some(first));
        spawner.assert_running_chains(&[first]);

        // Current and next are different container chains: both are running
        spawner.handle_update_assignment(Some(first), Some(second));
        spawner.assert_collating_on(Some(first));
        spawner.assert_running_chains(&[first, second]);

        spawner.handle_update_assignment(Some(second), Some(second));
        spawner.assert_collating_on(Some(second));
        spawner.assert_running_chains(&[second]);

        // The orchestrator as next assignment does not show up as a running container chain
        spawner.handle_update_assignment(Some(second), Some(orchestrator));
        spawner.assert_collating_on(Some(second));
        spawner.assert_running_chains(&[second]);

        spawner.handle_update_assignment(Some(orchestrator), Some(orchestrator));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);
    }

1641

1642

    #[test]
    fn swap_current_next() {
        // Going from (2000, 2001) to (2001, 2000) shouldn't start or stop any container chains
        let first = ParaId::from(2000);
        let second = ParaId::from(2001);
        let mut spawner: MockContainerChainSpawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(first), Some(second));
        spawner.assert_collating_on(Some(first));
        spawner.assert_running_chains(&[first, second]);

        // Swapped assignment: same chains running, collation moves to the new current
        spawner.handle_update_assignment(Some(second), Some(first));
        spawner.assert_collating_on(Some(second));
        spawner.assert_running_chains(&[first, second]);
    }

1655

1656

    #[test]
    fn stop_collating_orchestrator() {
        let orchestrator = ParaId::from(1000);
        let mut spawner: MockContainerChainSpawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(orchestrator), Some(orchestrator));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);

        spawner.handle_update_assignment(Some(orchestrator), None);
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);

        // Losing the assignment stops collation on the orchestrator
        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        // Getting the assignment back resumes collation on the orchestrator
        spawner.handle_update_assignment(Some(orchestrator), None);
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);
    }

1676

1677

    #[test]
    fn stop_collating_container() {
        let container = ParaId::from(2000);
        let mut spawner: MockContainerChainSpawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(container), None);
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        // No assignment: the container chain is stopped
        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        // Container chain as next assignment only: spawned again, not collating yet
        spawner.handle_update_assignment(None, Some(container));
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[container]);

        // This will send a CollateOn message to the same chain as the last CollateOn,
        // but this is needed because that chain has been stopped
        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);
    }

1699

1700

    #[test]
    fn stop_collating_container_start_immediately() {
        let container = ParaId::from(2000);
        let mut spawner: MockContainerChainSpawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(container), None);
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        // No assignment: the container chain is stopped
        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        // This will start the chain already collating
        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);
    }

1717

1718

    #[test]
    fn stop_all_chains() {
        let first = ParaId::from(2000);
        let second = ParaId::from(2001);
        let mut spawner: MockContainerChainSpawner = MockContainerChainSpawner::new();

        // Two different container chains as current and next: both are running
        spawner.handle_update_assignment(Some(first), Some(second));
        spawner.assert_collating_on(Some(first));
        spawner.assert_running_chains(&[first, second]);

        // Dropping both assignments at once stops both chains
        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);
    }

1730

1731

    #[test]
    fn keep_collating_on_container() {
        let container = ParaId::from(2000);
        let mut spawner: MockContainerChainSpawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(container), None);
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        // The chain moves from current to next: it stays running, but is not collated on
        spawner.handle_update_assignment(None, Some(container));
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[container]);

        // Assigned as current again: collation resumes on the chain
        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);
    }

1747

1748

    #[test]
    fn invalid_boot_nodes_are_ignored() {
        let para_id = 100.into();
        let valid_boot_node =
            b"/ip4/127.0.0.1/tcp/33049/ws/p2p/12D3KooWHVMhQDHBpj9vQmssgyfspYecgV6e3hH1dQVDUkUbCYC9"
                .to_vec();

        // A boot node that is not a multiaddr is dropped
        let not_a_multiaddr = vec![b"A".to_vec()];
        assert_eq!(
            parse_boot_nodes_ignore_invalid(not_a_multiaddr, para_id),
            vec![]
        );

        // So is a boot node made of the single byte 0xff
        let single_ff_byte = vec![b"\xff".to_vec()];
        assert_eq!(
            parse_boot_nodes_ignore_invalid(single_ff_byte, para_id),
            vec![]
        );

        // Valid boot nodes are not ignored
        assert_eq!(
            parse_boot_nodes_ignore_invalid(vec![valid_boot_node], para_id).len(),
            1
        );
    }

1768

1769

    #[test]
    fn path_ancestors() {
        // Test the implementation of `delete_container_chain_db`
        let db_path = PathBuf::from("/tmp/zombienet/Collator2002-01/data/containers/chains/simple_container_2002/paritydb/full-container-2002");
        let expected_parent = PathBuf::from(
            "/tmp/zombienet/Collator2002-01/data/containers/chains/simple_container_2002",
        );

        // `ancestors()` yields the path itself first, so index 2 is two levels up
        let parent = db_path.ancestors().nth(2).unwrap();

        assert_eq!(parent, expected_parent);
    }

1782

1783

    #[test]
    fn para_id_from_folder_name() {
        // Names without a numeric suffix after the last '-' yield no para id
        for folder_name in ["", "full", "full-container", "full-container-"] {
            assert_eq!(parse_para_id_from_folder_name(folder_name), None);
        }

        // The number after the last '-' is the para id
        let expected = Some(ParaId::from(2000));
        assert_eq!(parse_para_id_from_folder_name("full-container-2000"), expected);
    }

1794