Grcov report - spawner.rs

1

// Copyright (C) Moondance Labs Ltd.

2

// This file is part of Tanssi.

3

4

// Tanssi is free software: you can redistribute it and/or modify

5

// it under the terms of the GNU General Public License as published by

6

// the Free Software Foundation, either version 3 of the License, or

7

// (at your option) any later version.

8

9

// Tanssi is distributed in the hope that it will be useful,

10

// but WITHOUT ANY WARRANTY; without even the implied warranty of

11

// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

12

// GNU General Public License for more details.

13

14

// You should have received a copy of the GNU General Public License

15

// along with Tanssi.  If not, see <http://www.gnu.org/licenses/>.

16

17

//! Container Chain Spawner

18

//!

19

//! Controls the starting and stopping of container chains.

20

//!

21

//! For more information about when the database is deleted, check the

22

//! [Keep db flowchart](https://raw.githubusercontent.com/moondance-labs/tanssi/master/docs/keep_db_flowchart.png)

23

24

use {

25

    crate::{

26

        cli::ContainerChainCli,

27

        monitor::{SpawnedContainer, SpawnedContainersMonitor},

28

        rpc::generate_rpc_builder::GenerateRpcBuilder,

29

        service::{

30

            start_node_impl_container, ContainerChainClient, MinimalContainerRuntimeApi,

31

            ParachainClient,

32

},

33

},

34

    cumulus_primitives_core::ParaId,

35

    cumulus_relay_chain_interface::RelayChainInterface,

36

    dancebox_runtime::{opaque::Block as OpaqueBlock, Block},

37

    dc_orchestrator_chain_interface::{OrchestratorChainInterface, PHash},

38

    frame_support::{CloneNoBound, DefaultNoBound},

39

    fs2::FileExt,

40

    futures::FutureExt,

41

    node_common::command::generate_genesis_block,

42

    pallet_author_noting_runtime_api::AuthorNotingApi,

43

    polkadot_primitives::CollatorPair,

44

    sc_cli::{Database, SyncMode},

45

    sc_network::config::MultiaddrWithPeerId,

46

    sc_service::SpawnTaskHandle,

47

    sc_transaction_pool::TransactionPoolHandle,

48

    sp_api::ProvideRuntimeApi,

49

    sp_core::H256,

50

    sp_keystore::KeystorePtr,

51

    sp_runtime::traits::Block as BlockT,

52

    std::{

53

        any::Any,

54

        collections::{HashMap, HashSet},

55

        marker::PhantomData,

56

        path::{Path, PathBuf},

57

        sync::{Arc, Mutex},

58

        time::Instant,

59

},

60

    tokio::{

61

        sync::{mpsc, oneshot},

62

        time::{sleep, Duration},

63

},

64

    tokio_util::sync::CancellationToken,

65

};

66

67

/// Maximum time to wait for the database to close before starting it again, used in
/// `wait_for_paritydb_lock`.
/// This is an upper bound: if the db is closed after 1 second, that function only waits 1 second.
const MAX_DB_RESTART_TIMEOUT: Duration = Duration::from_secs(60);

/// Block diff threshold above which we decide it will be faster to delete the database and
/// use warp sync, rather than using full sync to download a large number of blocks.
/// This is only needed because warp sync does not support syncing from a state that is not
/// genesis, it falls back to full sync in that case.
/// 30_000 blocks = 50 hours at 6s/block.
/// Assuming a syncing speed of 100 blocks per second, this will take 5 minutes to sync.
const MAX_BLOCK_DIFF_FOR_FULL_SYNC: u32 = 30_000;

78

79

/// Task that handles spawning a stopping container chains based on assignment.

80

/// The main loop is [rx_loop](ContainerChainSpawner::rx_loop).

81

pub struct ContainerChainSpawner<

82

    RuntimeApi: MinimalContainerRuntimeApi,

83

    TGenerateRpcBuilder: GenerateRpcBuilder<RuntimeApi>,

84

> {

85

    /// Start container chain params

86

    pub params: ContainerChainSpawnParams<RuntimeApi, TGenerateRpcBuilder>,

87

88

    /// State

89

    pub state: Arc<Mutex<ContainerChainSpawnerState>>,

90

91

    /// Before the first assignment, there is a db cleanup process that removes folders of container

92

    /// chains that we are no longer assigned to.

93

    pub db_folder_cleanup_done: bool,

94

95

    /// Async callback that enables collation on the orchestrator chain

96

    pub collate_on_tanssi:

97

        Arc<dyn Fn() -> (CancellationToken, futures::channel::oneshot::Receiver<()>) + Send + Sync>,

98

    /// Stores the cancellation token used to stop the orchestrator chain collator process.

99

    /// When this is None, the orchestrator collator is not running.

100

    pub collation_cancellation_constructs:

101

        Option<(CancellationToken, futures::channel::oneshot::Receiver<()>)>,

102

103

104

/// Struct with all the params needed to start a container chain node given the CLI arguments,

105

/// and creating the ChainSpec from on-chain data from the orchestrator chain.

106

/// These params must be the same for all container chains, params that change such as the

107

/// `container_chain_para_id` should be passed as separate arguments to the [try_spawn] function.

108

///

109

/// This struct MUST NOT contain types (outside of `Option<CollationParams>`) obtained through

110

/// running an embeded orchestrator node, as this will prevent spawning a container chain in a node

111

/// connected to an orchestrator node through WebSocket.

112

#[derive(CloneNoBound)]

113

pub struct ContainerChainSpawnParams<

114

    RuntimeApi: MinimalContainerRuntimeApi,

115

    TGenerateRpcBuilder: GenerateRpcBuilder<RuntimeApi>,

116

> {

117

    pub orchestrator_chain_interface: Arc<dyn OrchestratorChainInterface>,

118

    pub container_chain_cli: ContainerChainCli,

119

    pub tokio_handle: tokio::runtime::Handle,

120

    pub chain_type: sc_chain_spec::ChainType,

121

    pub relay_chain: String,

122

    pub relay_chain_interface: Arc<dyn RelayChainInterface>,

123

    pub sync_keystore: KeystorePtr,

124

    pub orchestrator_para_id: ParaId,

125

    pub spawn_handle: SpawnTaskHandle,

126

    pub collation_params: Option<CollationParams>,

127

    pub data_preserver: bool,

128

    pub generate_rpc_builder: TGenerateRpcBuilder,

129

130

    pub phantom: PhantomData<RuntimeApi>,

131

132

133

/// Params specific to collation. This struct can contain types obtained through running an

134

/// embeded orchestrator node.

135

#[derive(Clone)]

136

pub struct CollationParams {

137

    pub collator_key: CollatorPair,

138

    pub orchestrator_tx_pool: Option<Arc<TransactionPoolHandle<OpaqueBlock, ParachainClient>>>,

139

    pub orchestrator_client: Option<Arc<ParachainClient>>,

140

    pub orchestrator_para_id: ParaId,

141

    /// If this is `false`, then `orchestrator_tx_pool` and `orchestrator_client` must be `Some`.

142

    pub solochain: bool,

143

144

145

/// Mutable state for container chain spawner. Keeps track of running chains.

146

#[derive(DefaultNoBound)]

147

pub struct ContainerChainSpawnerState {

148

    spawned_container_chains: HashMap<ParaId, ContainerChainState>,

149

    assigned_para_id: Option<ParaId>,

150

    next_assigned_para_id: Option<ParaId>,

151

    failed_para_ids: HashSet<ParaId>,

152

    // For debugging and detecting errors

153

    pub spawned_containers_monitor: SpawnedContainersMonitor,

154

155

156

pub struct ContainerChainState {

157

    /// Handle that can be used to stop the container chain

158

    stop_handle: StopContainerChain,

159

    /// Database path

160

    db_path: PathBuf,

161

162

163

/// Stops a container chain when signal is sent. The bool means `keep_db`, whether to keep the

164

/// container chain database (true) or remove it (false).

165

pub struct StopContainerChain {

166

    signal: oneshot::Sender<bool>,

167

    id: usize,

168

169

170

/// Messages used to control the `ContainerChainSpawner`. This is needed because one of the fields

171

/// of `ContainerChainSpawner` is not `Sync`, so we cannot simply pass an

172

/// `Arc<ContainerChainSpawner>` to other threads.

173

#[derive(Debug)]

174

pub enum CcSpawnMsg {

175

    /// Update container chain assignment

176

    UpdateAssignment {

177

        current: Option<ParaId>,

178

        next: Option<ParaId>,

179

},

180

181

182

// Separate function to allow using `?` to return a result, and also to avoid using `self` in an

183

// async function. Mutable state should be written by locking `state`.

184

// TODO: `state` should be an async mutex

185

async fn try_spawn<

186

    RuntimeApi: MinimalContainerRuntimeApi,

187

    TGenerateRpcBuilder: GenerateRpcBuilder<RuntimeApi>,

188

>(

189

    try_spawn_params: ContainerChainSpawnParams<RuntimeApi, TGenerateRpcBuilder>,

190

    state: Arc<Mutex<ContainerChainSpawnerState>>,

191

    container_chain_para_id: ParaId,

192

    start_collation: bool,

193

) -> sc_service::error::Result<()> {

194

    let ContainerChainSpawnParams {

195

        orchestrator_chain_interface,

196

        mut container_chain_cli,

197

        tokio_handle,

198

        chain_type,

199

        relay_chain,

200

        relay_chain_interface,

201

        sync_keystore,

202

        spawn_handle,

203

        mut collation_params,

204

        data_preserver,

205

        generate_rpc_builder,

206

..

207

    } = try_spawn_params;

208

    // Preload genesis data from orchestrator chain storage.

209

210

    // TODO: the orchestrator chain node may not be fully synced yet,

211

    // in that case we will be reading an old state.

212

    let orchestrator_block_hash = orchestrator_chain_interface

213

        .finalized_block_hash()

214

        .await

215

        .map_err(|e| format!("Failed to get latest block hash: {e}"))?;

216

217

    log::info!(

218

        "Detected assignment for container chain {}",

219

        container_chain_para_id

220

);

221

222

    let genesis_data = orchestrator_chain_interface

223

        .genesis_data(orchestrator_block_hash, container_chain_para_id)

224

        .await

225

        .map_err(|e| format!("Failed to call genesis_data runtime api: {}", e))?

226

        .ok_or_else(|| {

227

            format!(

228

                "No genesis data registered for container chain id {}",

229

                container_chain_para_id

230

231

        })?;

232

233

    let boot_nodes_raw = orchestrator_chain_interface

234

        .boot_nodes(orchestrator_block_hash, container_chain_para_id)

235

        .await

236

        .map_err(|e| format!("Failed to call boot_nodes runtime api: {}", e))?;

237

238

    if boot_nodes_raw.is_empty() {

239

        log::warn!(

240

            "No boot nodes registered on-chain for container chain {}",

241

            container_chain_para_id

242

);

243

244

    let boot_nodes = parse_boot_nodes_ignore_invalid(boot_nodes_raw, container_chain_para_id);

245

    if boot_nodes.is_empty() {

246

        log::warn!(

247

            "No valid boot nodes for container chain {}",

248

            container_chain_para_id

249

);

250

251

252

    container_chain_cli

253

        .preload_chain_spec_from_genesis_data(

254

            container_chain_para_id.into(),

255

            genesis_data,

256

            chain_type.clone(),

257

            relay_chain.clone(),

258

            boot_nodes,

259

260

        .map_err(|e| {

261

            format!(

262

                "failed to create container chain chain spec from on chain genesis data: {}",

263

264

265

        })?;

266

267

    log::info!(

268

        "Loaded chain spec for container chain {}",

269

        container_chain_para_id

270

);

271

272

    if !data_preserver && !start_collation {

273

        log::info!("This is a syncing container chain, using random ports");

274

275

        collation_params = None;

276

277

        // Use random ports to avoid conflicts with the other running container chain

278

        let random_ports = [23456, 23457, 23458];

279

280

        container_chain_cli

281

            .base

282

            .base

283

            .prometheus_params

284

            .prometheus_port = Some(random_ports[0]);

285

        container_chain_cli.base.base.network_params.port = Some(random_ports[1]);

286

        container_chain_cli.base.base.rpc_params.rpc_port = Some(random_ports[2]);

287

288

289

    let validator = collation_params.is_some();

290

291

    // Update CLI params

292

    container_chain_cli.base.para_id = Some(container_chain_para_id.into());

293

    container_chain_cli

294

        .base

295

        .base

296

        .import_params

297

        .database_params

298

        .database = Some(Database::ParityDb);

299

300

    let keep_db = container_chain_cli.base.keep_db;

301

302

    // Get a closure that checks if db_path exists.Need this to know when to use full sync instead of warp sync.

303

    let check_db_exists = {

304

        // Get db_path from config

305

        let mut container_chain_cli_config = sc_cli::SubstrateCli::create_configuration(

306

            &container_chain_cli,

307

            &container_chain_cli,

308

            tokio_handle.clone(),

309

310

        .map_err(|err| format!("Container chain argument error: {}", err))?;

311

312

        // Change database path to make it depend on container chain para id

313

        // So instead of the usual "db/full" we have "db/full-container-2000"

314

        let mut db_path = container_chain_cli_config

315

            .database

316

            .path()

317

            .ok_or_else(|| "Failed to get database path".to_string())?

318

            .to_owned();

319

        db_path.set_file_name(format!("full-container-{}", container_chain_para_id));

320

        container_chain_cli_config.database.set_path(&db_path);

321

322

        // Return a closure because we may need to check if the db exists multiple times

323

        move || db_path.exists()

324

};

325

326

    // Start container chain node. After starting, check if the database is good or needs to

327

    // be removed. If the db needs to be removed, this function will handle the node restart, and

328

    // return the components of a running container chain node.

329

    // This should be a separate function, but it has so many arguments that I prefer to have it as a closure for now

330

    let start_node_impl_container_with_restart = || async move {

331

        // Loop will run at most 2 times: 1 time if the db is good and 2 times if the db needs to be removed

332

        for _ in 0..2 {

333

            let db_existed_before = check_db_exists();

334

            container_chain_cli.base.base.network_params.sync = SyncMode::Warp;

335

            log::info!(

336

                "Container chain sync mode: {:?}",

337

                container_chain_cli.base.base.network_params.sync

338

);

339

340

            let mut container_chain_cli_config = sc_cli::SubstrateCli::create_configuration(

341

                &container_chain_cli,

342

                &container_chain_cli,

343

                tokio_handle.clone(),

344

345

            .map_err(|err| format!("Container chain argument error: {}", err))?;

346

347

            // Change database path to make it depend on container chain para id

348

            // So instead of the usual "db/full" we have "db/full-container-2000"

349

            let mut db_path = container_chain_cli_config

350

                .database

351

                .path()

352

                .ok_or_else(|| "Failed to get database path".to_string())?

353

                .to_owned();

354

            db_path.set_file_name(format!("full-container-{}", container_chain_para_id));

355

            container_chain_cli_config.database.set_path(&db_path);

356

357

            let (container_chain_task_manager, container_chain_client, container_chain_db) =

358

                start_node_impl_container(

359

                    container_chain_cli_config,

360

                    relay_chain_interface.clone(),

361

                    orchestrator_chain_interface.clone(),

362

                    sync_keystore.clone(),

363

                    container_chain_para_id,

364

                    collation_params.clone(),

365

                    generate_rpc_builder.clone(),

366

                    &container_chain_cli,

367

                    data_preserver,

368

369

                .await?;

370

371

            // Keep all node parts in one variable to make them easier to drop

372

            let node_parts = (

373

                container_chain_task_manager,

374

                container_chain_client,

375

                container_chain_db,

376

                db_path,

377

);

378

379

            if db_existed_before {

380

                // If the database already existed before, check if it can be used or it needs to be removed.

381

                // To remove the database, we restart the node, wait for the db to close to avoid a

382

                // "shutdown error" log, and then remove it.

383

                if let Some(db_removal_reason) = db_needs_removal(

384

                    &node_parts.1,

385

                    &orchestrator_chain_interface,

386

                    orchestrator_block_hash,

387

                    container_chain_para_id,

388

                    &container_chain_cli,

389

                    container_chain_cli.base.keep_db,

390

391

                .await?

392

393

                    let db_path = node_parts.3.clone();

394

                    // Important, drop `node_parts` before trying to `wait_for_paritydb_lock`

395

                    drop(node_parts);

396

                    // Wait here to for the database created in the previous loop iteration to close.

397

                    // Dropping is not enough because there is some background process that keeps the database open,

398

                    // so we check the paritydb lock file directly.

399

                    log::info!(

400

                        "Restarting container chain {} after db deletion. Reason: {:?}",

401

                        container_chain_para_id,

402

                        db_removal_reason,

403

);

404

                    wait_for_paritydb_lock(&db_path, MAX_DB_RESTART_TIMEOUT)

405

                        .await

406

                        .map_err(|e| {

407

                            log::warn!(

408

                                "Error waiting for chain {} to release db lock: {:?}",

409

                                container_chain_para_id,

410

411

);

412

413

414

                        })?;

415

                    delete_container_chain_db(&db_path);

416

417

                    // Recursion, will only happen once because `db_existed_before` will be false after

418

                    // removing the db. Apparently closures cannot be recursive so fake recursion by

419

                    // using a loop + continue

420

                    continue;

421

422

423

424

            // If using full sync, print a warning if the local db is at block 0 and the chain has thousands of blocks

425

            if container_chain_cli.base.base.network_params.sync == SyncMode::Full {

426

                let last_container_block_temp = node_parts.1.chain_info().best_number;

427

                let cc_block_num = get_latest_container_block_number_from_orchestrator(

428

                    &orchestrator_chain_interface,

429

                    orchestrator_block_hash,

430

                    container_chain_para_id,

431

432

                .await

433

                .unwrap_or(0);

434

                if last_container_block_temp == 0 && cc_block_num > MAX_BLOCK_DIFF_FOR_FULL_SYNC {

435

                    let db_folder = format!("full-container-{}", container_chain_para_id);

436

                    log::error!("\

437

                        Existing database for container chain {} is at block 0, assuming that warp sync failed.\n\

438

                        The node will now use full sync, which has to download {} blocks.\n\

439

                        If running as collator, it may not finish syncing on time and miss block rewards.\n\

440

                        To force using warp sync, stop tanssi-node and manually remove the db folder: {:?}\n\

441

                        ", container_chain_para_id, cc_block_num, db_folder)

442

443

444

445

            return sc_service::error::Result::Ok(node_parts);

446

447

448

        unreachable!("Above loop can run at most 2 times, and in the second iteration it is guaranteed to return")

449

};

450

451

    let (mut container_chain_task_manager, container_chain_client, container_chain_db, db_path) =

452

        start_node_impl_container_with_restart().await?;

453

454

    // Signal that allows to gracefully stop a container chain

455

    let (signal, on_exit) = oneshot::channel::<bool>();

456

457

    let monitor_id;

458

459

        let mut state = state.lock().expect("poison error");

460

        let container_chain_client = container_chain_client as Arc<dyn Any + Sync + Send>;

461

462

        monitor_id = state.spawned_containers_monitor.push(SpawnedContainer {

463

            id: 0,

464

            para_id: container_chain_para_id,

465

            start_time: Instant::now(),

466

            stop_signal_time: None,

467

            stop_task_manager_time: None,

468

            stop_refcount_time: Default::default(),

469

            backend: Arc::downgrade(&container_chain_db),

470

            client: Arc::downgrade(&container_chain_client),

471

});

472

473

        if state

474

            .spawned_container_chains

475

            .contains_key(&container_chain_para_id)

476

477

            return Err(format!("Tried to spawn a container chain when another container chain with the same para id was already running: {:?}", container_chain_para_id).into());

478

479

        state.spawned_container_chains.insert(

480

            container_chain_para_id,

481

            ContainerChainState {

482

                stop_handle: StopContainerChain {

483

                    signal,

484

                    id: monitor_id,

485

},

486

                db_path: db_path.clone(),

487

},

488

);

489

490

491

    // Add the container chain task manager as a child task to the parent task manager.

492

    // We want to stop the node if this task manager stops, but we also want to allow a

493

    // graceful shutdown using the `on_exit` future.

494

    let name = "container-chain-task-manager";

495

    spawn_handle.spawn(name, None, async move {

496

        let mut container_chain_task_manager_future =

497

            container_chain_task_manager.future().fuse();

498

        let mut on_exit_future = on_exit.fuse();

499

500

        futures::select! {

501

            res1 = container_chain_task_manager_future => {

502

                // An essential task failed or the task manager was stopped unexpectedly

503

                // using `.terminate()`. This should stop the container chain but not the node.

504

                if res1.is_err() {

505

                    log::error!("Essential task failed in container chain {} task manager. Shutting down container chain service", container_chain_para_id);

506

                } else {

507

                    log::error!("Unexpected shutdown in container chain {} task manager. Shutting down container chain service", container_chain_para_id);

508

509

                // Mark this container chain as "failed to stop" to avoid warning in `self.stop()`

510

                let mut state = state.lock().expect("poison error");

511

                state.failed_para_ids.insert(container_chain_para_id);

512

                // Never delete db in this case because it is not a graceful shutdown

513

514

            stop_unassigned = on_exit_future => {

515

                // Graceful shutdown.

516

                // `stop_unassigned` will be `Ok(keep_db)` if `.stop()` has been called, which means that the

517

                // container chain has been unassigned, and will be `Err` if the handle has been dropped,

518

                // which means that the node is stopping.

519

                // Delete existing database if running as collator

520

                if validator && stop_unassigned == Ok(false) && !keep_db {

521

                    // If this breaks after a code change, make sure that all the variables that

522

                    // may keep the chain alive are dropped before the call to `wait_for_paritydb_lock`.

523

                    drop(container_chain_task_manager_future);

524

                    drop(container_chain_task_manager);

525

                    let db_closed = wait_for_paritydb_lock(&db_path, MAX_DB_RESTART_TIMEOUT)

526

                        .await

527

                        .map_err(|e| {

528

                            log::warn!(

529

                                "Error waiting for chain {} to release db lock: {:?}",

530

                                container_chain_para_id,

531

532

);

533

                        }).is_ok();

534

                    // If db has not closed in 60 seconds we do not delete it.

535

                    if db_closed {

536

                        delete_container_chain_db(&db_path);

537

538

539

540

541

542

        let mut state = state.lock().expect("poison error");

543

        state

544

            .spawned_containers_monitor

545

            .set_stop_task_manager_time(monitor_id, Instant::now());

546

});

547

548

    Ok(())

549

550

551

/// Interface for spawning and stopping container chain embeded nodes.

552

pub trait Spawner {

553

    /// Access to the Orchestrator Chain Interface

554

    fn orchestrator_chain_interface(&self) -> Arc<dyn OrchestratorChainInterface>;

555

556

    /// Try to start a new container chain. In case of an error, this does not stop the node, and

557

    /// the container chain will be attempted to spawn again when the collator is reassigned to it.

558

///

559

    /// It is possible that we try to spawn-stop-spawn the same chain, and the second spawn fails

560

    /// because the chain has not stopped yet, because `stop` does not wait for the chain to stop,

561

    /// so before calling `spawn` make sure to call `wait_for_paritydb_lock` before, like we do in

562

    /// `handle_update_assignment`.

563

    fn spawn(

564

        &self,

565

        container_chain_para_id: ParaId,

566

        start_collation: bool,

567

    ) -> impl std::future::Future<Output = ()> + Send;

568

569

    /// Stop a container chain. Prints a warning if the container chain was not running.

570

    /// Returns the database path for the container chain, can be used with `wait_for_paritydb_lock`

571

    /// to ensure that the container chain has fully stopped. The database path can be `None` if the

572

    /// chain was not running.

573

    fn stop(&self, container_chain_para_id: ParaId, keep_db: bool) -> Option<PathBuf>;

574

575

576

impl<

577

        RuntimeApi: MinimalContainerRuntimeApi,

578

        TGenerateRpcBuilder: GenerateRpcBuilder<RuntimeApi>,

579

    > Spawner for ContainerChainSpawner<RuntimeApi, TGenerateRpcBuilder>

580

581

    /// Access to the Orchestrator Chain Interface

582

    fn orchestrator_chain_interface(&self) -> Arc<dyn OrchestratorChainInterface> {

583

        self.params.orchestrator_chain_interface.clone()

584

585

586

    /// Try to start a new container chain. In case of an error, this does not stop the node, and

587

    /// the container chain will be attempted to spawn again when the collator is reassigned to it.

588

///

589

    /// It is possible that we try to spawn-stop-spawn the same chain, and the second spawn fails

590

    /// because the chain has not stopped yet, because `stop` does not wait for the chain to stop,

591

    /// so before calling `spawn` make sure to call `wait_for_paritydb_lock` before, like we do in

592

    /// `handle_update_assignment`.

593

    async fn spawn(&self, container_chain_para_id: ParaId, start_collation: bool) {

594

        let try_spawn_params = self.params.clone();

595

        let state = self.state.clone();

596

        let state2 = state.clone();

597

598

        match try_spawn(

599

            try_spawn_params,

600

            state,

601

            container_chain_para_id,

602

            start_collation,

603

604

        .await

605

606

            Ok(()) => {}

607

            Err(e) => {

608

                log::error!(

609

                    "Failed to start container chain {}: {}",

610

                    container_chain_para_id,

611

612

);

613

                // Mark this container chain as "failed to start"

614

                let mut state = state2.lock().expect("poison error");

615

                state.failed_para_ids.insert(container_chain_para_id);

616

617

618

619

620

    /// Stop a container chain. Prints a warning if the container chain was not running.

621

    /// Returns the database path for the container chain, can be used with `wait_for_paritydb_lock`

622

    /// to ensure that the container chain has fully stopped. The database path can be `None` if the

623

    /// chain was not running.

624

    fn stop(&self, container_chain_para_id: ParaId, keep_db: bool) -> Option<PathBuf> {

625

        let mut state = self.state.lock().expect("poison error");

626

        let stop_handle = state

627

            .spawned_container_chains

628

            .remove(&container_chain_para_id);

629

630

        match stop_handle {

631

            Some(stop_handle) => {

632

                log::info!("Stopping container chain {}", container_chain_para_id);

633

634

                let id = stop_handle.stop_handle.id;

635

                state

636

                    .spawned_containers_monitor

637

                    .set_stop_signal_time(id, Instant::now());

638

639

                // Send signal to perform graceful shutdown, which will delete the db if needed

640

                let _ = stop_handle.stop_handle.signal.send(keep_db);

641

642

                Some(stop_handle.db_path)

643

644

            None => {

645

                // Do not print the warning message if this is a container chain that has failed to

646

                // start, because in that case it will not be running

647

                if !state.failed_para_ids.remove(&container_chain_para_id) {

648

                    log::warn!(

649

                        "Tried to stop a container chain that is not running: {}",

650

                        container_chain_para_id

651

);

652

653

654

                None

655

656

657

658

659

660

impl<

661

        RuntimeApi: MinimalContainerRuntimeApi,

662

        TGenerateRpcBuilder: GenerateRpcBuilder<RuntimeApi>,

663

    > ContainerChainSpawner<RuntimeApi, TGenerateRpcBuilder>

664

665

    /// Receive and process `CcSpawnMsg`s indefinitely

666

    pub async fn rx_loop(

667

        mut self,

668

        mut rx: mpsc::UnboundedReceiver<CcSpawnMsg>,

669

        validator: bool,

670

        solochain: bool,

671

) {

672

        // The node always starts as an orchestrator chain collator.

673

        // This is because the assignment is detected after importing a new block, so if all

674

        // collators stop at the same time, when they start again nobody will produce the new block.

675

        // So all nodes start as orchestrator chain collators, until the first block is imported,

676

        // then the real assignment is used.

677

        // Except in solochain mode, then the initial assignment is None.

678

        if validator && !solochain {

679

            self.handle_update_assignment(Some(self.params.orchestrator_para_id), None)

680

                .await;

681

682

683

        while let Some(msg) = rx.recv().await {

684

            match msg {

685

                CcSpawnMsg::UpdateAssignment { current, next } => {

686

                    self.handle_update_assignment(current, next).await;

687

688

689

690

691

        // The while loop can end if all the senders get dropped, but since this is an

692

        // essential task we don't want it to stop. So await a future that never completes.

693

        // This should only happen when starting a full node.

694

        if !validator {

695

            let () = std::future::pending().await;

696

697

698

699

    /// Handle `CcSpawnMsg::UpdateAssignment`

700

    async fn handle_update_assignment(&mut self, current: Option<ParaId>, next: Option<ParaId>) {

701

        if !self.db_folder_cleanup_done {

702

            self.db_folder_cleanup_done = true;

703

704

            // Disabled when running with --keep-db

705

            let keep_db = self.params.container_chain_cli.base.keep_db;

706

            if !keep_db {

707

                let mut chains_to_keep = HashSet::new();

708

                chains_to_keep.extend(current);

709

                chains_to_keep.extend(next);

710

                self.db_folder_cleanup(&chains_to_keep);

711

712

713

714

        let HandleUpdateAssignmentResult {

715

            chains_to_stop,

716

            chains_to_start,

717

            need_to_restart: _,

718

        } = handle_update_assignment_state_change(

719

            &mut self.state.lock().expect("poison error"),

720

            self.params.orchestrator_para_id,

721

            current,

722

            next,

723

);

724

725

        if current != Some(self.params.orchestrator_para_id) {

726

            // If not assigned to orchestrator chain anymore, we need to stop the collator process

727

            let maybe_exit_notification_receiver = self

728

                .collation_cancellation_constructs

729

                .take()

730

                .map(|(cancellation_token, exit_notification_receiver)| {

731

                    cancellation_token.cancel();

732

                    exit_notification_receiver

733

});

734

735

            if let Some(exit_notification_receiver) = maybe_exit_notification_receiver {

736

                let _ = exit_notification_receiver.await;

737

738

        } else if self.collation_cancellation_constructs.is_none() {

739

            // If assigned to orchestrator chain but the collator process is not running, start it

740

            self.collation_cancellation_constructs = Some((self.collate_on_tanssi)());

741

742

743

        // Stop all container chains that are no longer needed

744

        let mut db_paths_restart = vec![];

745

        for para_id in chains_to_stop {

746

            // Keep db if we are currently assigned to this chain

747

            let keep_db = Some(para_id) == current;

748

            let maybe_db_path = self.stop(para_id, keep_db);

749

            // If we are restarting this chain, save its db_path to check when it actually stopped

750

            if let Some(db_path) = maybe_db_path {

751

                if chains_to_start.contains(&para_id) {

752

                    db_paths_restart.push((para_id, db_path));

753

754

755

756

757

        if !db_paths_restart.is_empty() {

758

            // Ensure the chains we stopped actually stopped by checking if their database is unlocked.

759

            // Using `join_all` because in one edge case we may be restarting 2 chains,

760

            // but almost always this will be only one future.

761

            let futs = db_paths_restart

762

                .into_iter()

763

                .map(|(para_id, db_path)| async move {

764

                    wait_for_paritydb_lock(&db_path, MAX_DB_RESTART_TIMEOUT)

765

                        .await

766

                        .map_err(|e| {

767

                            log::warn!(

768

                                "Error waiting for chain {} to release db lock: {:?}",

769

                                para_id,

770

771

);

772

})

773

});

774

            futures::future::join_all(futs).await;

775

776

777

        // Start all new container chains (usually 1)

778

        for para_id in chains_to_start {

779

            // Edge case: when starting the node it may be assigned to a container chain, so we need to

780

            // start a container chain already collating.

781

            // TODO: another edge case: if current == None, and running_chains == 0,

782

            // and chains_to_start == 1, we can start this chain as collating, and we won't need

783

            // to restart it on the next session. We need to add some extra state somewhere to

784

            // implement this properly.

785

            let start_collation = Some(para_id) == current;

786

            self.spawn(para_id, start_collation).await;

787

788

789

790

    fn db_folder_cleanup(&self, chains_to_keep: &HashSet<ParaId>) {

791

        // "containers" folder

792

        let mut base_path = self

793

            .params

794

            .container_chain_cli

795

            .base

796

            .base

797

            .shared_params

798

            .base_path

799

            .as_ref()

800

            .expect("base_path is always set")

801

            .to_owned();

802

803

        // "containers/chains"

804

        base_path.push("chains");

805

806

        // Inside chains folder we have container folders such as

807

        // containers/chains/simple_container_2000/

808

        // containers/chains/frontier_container_2001/

809

        // But this is not the para id, it's the chain id which we have set to include the para id, but that's not mandatory.

810

        // To get the para id we need to look for the paritydb folder:

811

        // containers/chains/frontier_container_2001/paritydb/full-container-2001/

812

        let mut chain_folders = sort_container_folders_by_para_id(&base_path);

813

814

        // Keep chains that we are assigned to

815

        for para_id in chains_to_keep {

816

            chain_folders.remove(&Some(*para_id));

817

818

819

        // Print nice log message when removing folders

820

        if !chain_folders.is_empty() {

821

            let chain_folders_fmt = chain_folders

822

                .iter()

823

                .flat_map(|(para_id, vec_paths)| {

824

                    let para_id_fmt = if let Some(para_id) = para_id {

825

                        para_id.to_string()

826

                    } else {

827

                        "None".to_string()

828

};

829

                    vec_paths

830

                        .iter()

831

                        .map(move |path| format!("\n{}: {}", para_id_fmt, path.display()))

832

})

833

                .collect::<String>();

834

            log::info!(

835

                "db_folder_cleanup: removing container folders: (para_id, path):{}",

836

                chain_folders_fmt

837

);

838

839

840

        // Remove, ignoring errors

841

        for (_para_id, folders) in chain_folders {

842

            for folder in folders {

843

                let _ = std::fs::remove_dir_all(&folder);

844

845

846

847

848

849

struct HandleUpdateAssignmentResult {

850

    chains_to_stop: Vec<ParaId>,

851

    chains_to_start: Vec<ParaId>,

852

    #[allow(dead_code)] // no longer used except in tests

853

    need_to_restart: bool,

854

855

856

// This is a separate function to allow testing

857

35

fn handle_update_assignment_state_change(

858

35

    state: &mut ContainerChainSpawnerState,

859

35

    orchestrator_para_id: ParaId,

860

35

    current: Option<ParaId>,

861

35

    next: Option<ParaId>,

862

35

) -> HandleUpdateAssignmentResult {

863

35

    if (state.assigned_para_id, state.next_assigned_para_id) == (current, next) {

864

        // If nothing changed there is nothing to update

865

        return HandleUpdateAssignmentResult {

866

            chains_to_stop: Default::default(),

867

            chains_to_start: Default::default(),

868

            need_to_restart: false,

869

};

870

35

871

35

872

35

    // Create a set with the container chains that were running before, and the container

873

35

    // chains that should be running after the updated assignment. This is used to calculate

874

35

    // the difference, and stop and start the required container chains.

875

35

    let mut running_chains_before = HashSet::new();

876

35

    let mut running_chains_after = HashSet::new();

877

35

878

35

    running_chains_before.extend(state.assigned_para_id);

879

35

    running_chains_before.extend(state.next_assigned_para_id);

880

35

    // Ignore orchestrator_para_id because it is handled in a special way, as it does not need to

881

35

    // start one session before in order to sync.

882

35

    running_chains_before.remove(&orchestrator_para_id);

883

35

884

35

    running_chains_after.extend(current);

885

35

    running_chains_after.extend(next);

886

35

    running_chains_after.remove(&orchestrator_para_id);

887

35

    let mut need_to_restart_current = false;

888

35

    let mut need_to_restart_next = false;

889

35

890

35

    if state.assigned_para_id != current {

891

24

        if let Some(para_id) = current {

892

            // If the assigned container chain has changed, we may need to

893

            // restart it in collation mode, unless it is the orchestrator chain.

894

16

            if para_id != orchestrator_para_id {

895

13

                need_to_restart_current = true;

896

13

897

8

898

899

24

        if let Some(para_id) = state.assigned_para_id {

900

18

            if para_id != orchestrator_para_id && Some(para_id) == next {

901

2

                need_to_restart_next = true;

902

16

903

6

904

11

905

906

35

    state.assigned_para_id = current;

907

35

    state.next_assigned_para_id = next;

908

35

909

35

    let mut chains_to_stop: Vec<_> = running_chains_before

910

35

        .difference(&running_chains_after)

911

35

        .copied()

912

35

        .collect();

913

35

    let mut chains_to_start: Vec<_> = running_chains_after

914

35

        .difference(&running_chains_before)

915

35

        .copied()

916

35

        .collect();

917

35

918

35

    if need_to_restart_current {

919

        // Force restart of new assigned container chain: if it was running before it was in "syncing mode",

920

        // which doesn't use the correct ports, so start it in "collation mode".

921

13

        let id = current.unwrap();

922

13

        if running_chains_before.contains(&id) && !chains_to_stop.contains(&id) {

923

6

            chains_to_stop.push(id);

924

7

925

13

        if !chains_to_start.contains(&id) {

926

6

            chains_to_start.push(id);

927

7

928

22

929

930

35

    if need_to_restart_next {

931

        // Handle edge case of going from (2000, 2001) to (2001, 2000). In that case we must restart both chains,

932

        // because previously 2000 was collating and now 2000 will only be syncing.

933

2

        let id = next.unwrap();

934

2

        if running_chains_before.contains(&id) && !chains_to_stop.contains(&id) {

935

2

            chains_to_stop.push(id);

936

2

937

2

        if !chains_to_start.contains(&id) {

938

2

            chains_to_start.push(id);

939

2

940

33

941

942

    HandleUpdateAssignmentResult {

943

35

        chains_to_stop,

944

35

        chains_to_start,

945

35

        need_to_restart: need_to_restart_current || need_to_restart_next,

946

947

35

948

949

/// Select [SyncMode] to use for a container chain.

950

/// We want to use warp sync unless the db still exists, or the container chain is

951

/// still at genesis block (because of a warp sync bug in that case).

952

///

953

/// Remember that warp sync doesn't work if a partially synced database already exists, it falls

954

/// back to full sync instead. The only exception is if the previous instance of the database was

955

/// interrupted before it finished downloading the state, in that case the node will use warp sync.

956

/// If it was interrupted during the block history download, the node will use full sync but also

957

/// finish the block history download in the background, even if sync mode is set to full sync.

958

pub fn select_sync_mode_using_client(

959

    db_exists: bool,

960

    orchestrator_client: &Arc<ParachainClient>,

961

    container_chain_para_id: ParaId,

962

) -> sc_service::error::Result<SyncMode> {

963

    if db_exists {

964

        // If the user wants to use warp sync, they should have already removed the database

965

        return Ok(SyncMode::Full);

966

967

968

    // The following check is only needed because of this bug:

969

    // https://github.com/paritytech/polkadot-sdk/issues/1930

970

971

    let orchestrator_runtime_api = orchestrator_client.runtime_api();

972

    let orchestrator_chain_info = orchestrator_client.chain_info();

973

974

    // If the container chain is still at genesis block, use full sync because warp sync is broken

975

    let full_sync_needed = orchestrator_runtime_api

976

        .latest_author(orchestrator_chain_info.best_hash, container_chain_para_id)

977

        .map_err(|e| format!("Failed to read latest author: {}", e))?

978

        .is_none();

979

980

    if full_sync_needed {

981

        Ok(SyncMode::Full)

982

    } else {

983

        Ok(SyncMode::Warp)

984

985

986

987

async fn get_latest_container_block_number_from_orchestrator(

988

    orchestrator_chain_interface: &Arc<dyn OrchestratorChainInterface>,

989

    orchestrator_block_hash: PHash,

990

    container_chain_para_id: ParaId,

991

) -> Option<u32> {

992

    // Get the container chain's latest block from orchestrator chain and compare with client's one

993

    orchestrator_chain_interface

994

        .latest_block_number(orchestrator_block_hash, container_chain_para_id)

995

        .await

996

        .unwrap_or_default()

997

998

999

#[derive(Debug)]

1000

#[allow(dead_code)]

1001

enum DbRemovalReason {

1002

    HighBlockDiff {

1003

        best_block_number_db: u32,

1004

        best_block_number_onchain: u32,

1005

},

1006

    GenesisHashMismatch {

1007

        container_client_genesis_hash: H256,

1008

        chain_spec_genesis_hash_v0: H256,

1009

        chain_spec_genesis_hash_v1: H256,

1010

},

1011

1012

1013

/// Given a container chain client, check if the database is valid. If not, returns `Some` with the

1014

/// reason for db removal.

1015

/// Reasons may be:

1016

/// * High block diff: when the local db is outdated and it would take a long time to sync using full sync, we remove it to be able to use warp sync.

1017

/// * Genesis hash mismatch, when the chain was deregistered and a different chain with the same para id was registered.

1018

async fn db_needs_removal<RuntimeApi: MinimalContainerRuntimeApi>(

1019

    container_chain_client: &Arc<ContainerChainClient<RuntimeApi>>,

1020

    orchestrator_chain_interface: &Arc<dyn OrchestratorChainInterface>,

1021

    orchestrator_block_hash: PHash,

1022

    container_chain_para_id: ParaId,

1023

    container_chain_cli: &ContainerChainCli,

1024

    keep_db: bool,

1025

) -> sc_service::error::Result<Option<DbRemovalReason>> {

1026

    // Check block diff, only needed if keep-db is false

1027

    if !keep_db {

1028

        // Get latest block number from the container chain client

1029

        let last_container_block_temp = container_chain_client.chain_info().best_number;

1030

        if last_container_block_temp == 0 {

1031

            // Don't remove an empty database, as it may be in the process of a warp sync

1032

        } else if get_latest_container_block_number_from_orchestrator(

1033

            orchestrator_chain_interface,

1034

            orchestrator_block_hash,

1035

            container_chain_para_id,

1036

1037

        .await

1038

        .unwrap_or(0)

1039

        .abs_diff(last_container_block_temp)

1040

            > MAX_BLOCK_DIFF_FOR_FULL_SYNC

1041

1042

            // if the diff is big, delete db and restart using warp sync

1043

            return Ok(Some(DbRemovalReason::HighBlockDiff {

1044

                best_block_number_db: last_container_block_temp,

1045

                best_block_number_onchain: last_container_block_temp,

1046

            }));

1047

1048

1049

1050

    // Generate genesis hash to compare against container client's genesis hash

1051

    let container_preloaded_genesis = container_chain_cli.preloaded_chain_spec.as_ref().unwrap();

1052

1053

    // Check with both state versions, but first v1 which is the latest

1054

    let block_v1: Block =

1055

        generate_genesis_block(&**container_preloaded_genesis, sp_runtime::StateVersion::V1)

1056

            .map_err(|e| format!("{:?}", e))?;

1057

    let chain_spec_genesis_hash_v1 = block_v1.header().hash();

1058

1059

    let container_client_genesis_hash = container_chain_client.chain_info().genesis_hash;

1060

1061

    if container_client_genesis_hash != chain_spec_genesis_hash_v1 {

1062

        let block_v0: Block =

1063

            generate_genesis_block(&**container_preloaded_genesis, sp_runtime::StateVersion::V0)

1064

                .map_err(|e| format!("{:?}", e))?;

1065

        let chain_spec_genesis_hash_v0 = block_v0.header().hash();

1066

1067

        if container_client_genesis_hash != chain_spec_genesis_hash_v0 {

1068

            log::info!("Container genesis V0: {:?}", chain_spec_genesis_hash_v0);

1069

            log::info!("Container genesis V1: {:?}", chain_spec_genesis_hash_v1);

1070

            log::info!(

1071

                "Chain spec genesis {:?} did not match with any container genesis - Restarting...",

1072

                container_client_genesis_hash

1073

);

1074

            return Ok(Some(DbRemovalReason::GenesisHashMismatch {

1075

                container_client_genesis_hash,

1076

                chain_spec_genesis_hash_v0,

1077

                chain_spec_genesis_hash_v1,

1078

            }));

1079

1080

1081

1082

    Ok(None)

1083

1084

1085

/// Remove the container chain database folder. This is called with db_path:

1086

///     `Collator2002-01/data/containers/chains/simple_container_2002/paritydb/full-container-2002`

1087

/// but we want to delete everything under

1088

///     `Collator2002-01/data/containers/chains/simple_container_2002`

1089

/// So we use `delete_empty_folders_recursive` to try to remove the parent folders as well, but only

1090

/// if they are empty. This is to avoid removing any secret keys or other important data.

1091

fn delete_container_chain_db(db_path: &Path) {

1092

    // Remove folder `full-container-2002`

1093

    let _ = std::fs::remove_dir_all(db_path);

1094

    // Remove all the empty folders inside `simple_container_2002`, including self

1095

    if let Some(parent) = db_path.ancestors().nth(2) {

1096

        delete_empty_folders_recursive(parent);

1097

1098

1099

1100

/// Removes all empty folders in `path`, recursively. Then, if `path` is empty, it removes it as well.
/// Ignores any IO errors. Does nothing if `path` cannot be listed.
fn delete_empty_folders_recursive(path: &Path) {
    let Ok(entries) = std::fs::read_dir(path) else {
        return;
    };

    // Entries that fail to be read are skipped; only directories are descended into.
    let subfolders = entries
        .flatten()
        .map(|entry| entry.path())
        .filter(|child| child.is_dir());
    for subfolder in subfolders {
        delete_empty_folders_recursive(&subfolder);
    }

    // Try to remove dir. Returns an error if the directory is not empty, but we ignore it.
    let _ = std::fs::remove_dir(path);
}

1124

1125

/// Parse a list of boot nodes in `Vec<u8>` format. Invalid boot nodes are filtered out.

1126

3

fn parse_boot_nodes_ignore_invalid(

1127

3

    boot_nodes_raw: Vec<Vec<u8>>,

1128

3

    container_chain_para_id: ParaId,

1129

3

) -> Vec<MultiaddrWithPeerId> {

1130

3

    boot_nodes_raw

1131

3

        .into_iter()

1132

3

        .filter_map(|x| {

1133

3

            let x = String::from_utf8(x)

1134

3

                .map_err(|e| {

1135

1

                    log::debug!(

1136

                        "Invalid boot node in container chain {}: {}",

1137

                        container_chain_para_id,

1138

1139

);

1140

3

})

1141

3

                .ok()?;

1142

1143

2

            x.parse::<MultiaddrWithPeerId>()

1144

2

                .map_err(|e| {

1145

1

                    log::debug!(

1146

                        "Invalid boot node in container chain {}: {}",

1147

                        container_chain_para_id,

1148

1149

1150

2

})

1151

2

                .ok()

1152

3

})

1153

3

        .collect()

1154

3

1155

1156

pub async fn wait_for_paritydb_lock(db_path: &Path, max_timeout: Duration) -> Result<(), String> {

1157

    let now = Instant::now();

1158

1159

    while now.elapsed() < max_timeout {

1160

        let lock_held = check_paritydb_lock_held(db_path)

1161

            .map_err(|e| format!("Failed to check if lock file is held: {}", e))?;

1162

        if !lock_held {

1163

            return Ok(());

1164

1165

        sleep(Duration::from_secs(1)).await;

1166

1167

1168

    Err("Timeout when waiting for paritydb lock".to_string())

1169

1170

1171

/// Given a path to a paritydb database, check if its lock file is held. This indicates that a

1172

/// background process is still using the database, so we should wait before trying to open it.

1173

///

1174

/// This should be kept up to date with the way paritydb handles the lock file:

1175

/// <https://github.com/paritytech/parity-db/blob/2b6820e310a08678d4540c044f41a93d87343ac8/src/db.rs#L215>

1176

fn check_paritydb_lock_held(db_path: &Path) -> Result<bool, std::io::Error> {

1177

    if !db_path.is_dir() {

1178

        // Lock file does not exist, so it is not held

1179

        return Ok(false);

1180

1181

1182

    let mut lock_path: std::path::PathBuf = db_path.to_owned();

1183

    lock_path.push("lock");

1184

    let lock_file = std::fs::OpenOptions::new()

1185

        .create(true)

1186

        .read(true)

1187

        .write(true)

1188

        .truncate(true)

1189

        .open(lock_path.as_path())?;

1190

    // Check if the lock file is busy by trying to lock it.

1191

    // Returns err if failed to adquire the lock.

1192

    let lock_held = lock_file.try_lock_exclusive().is_err();

1193

1194

    Ok(lock_held)

1195

1196

1197

fn sort_container_folders_by_para_id(

1198

    chains_folder_path: &Path,

1199

) -> HashMap<Option<ParaId>, Vec<PathBuf>> {

1200

    let mut h = HashMap::new();

1201

1202

    let entry_iter = std::fs::read_dir(chains_folder_path);

1203

    let entry_iter = match entry_iter {

1204

        Ok(x) => x,

1205

        Err(_e) => return h,

1206

};

1207

1208

    for entry in entry_iter {

1209

        let entry = match entry {

1210

            Ok(x) => x,

1211

            Err(_e) => continue,

1212

};

1213

1214

        let path = entry.path();

1215

        if path.is_dir() {

1216

            if let Ok(para_id) = process_container_folder_get_para_id(path.clone()) {

1217

                h.entry(para_id).or_default().push(path);

1218

1219

1220

1221

1222

1223

1224

1225

fn process_container_folder_get_para_id(path: PathBuf) -> std::io::Result<Option<ParaId>> {

1226

    // Build the path to the paritydb directory

1227

    let paritydb_path = path.join("paritydb");

1228

1229

    // Check if the paritydb directory exists and is a directory

1230

    if !paritydb_path.is_dir() {

1231

        // If not, associate the path with `None` in the hashmap

1232

        return Ok(None);

1233

1234

1235

    // Read the entries in the paritydb directory

1236

    let entry_iter = std::fs::read_dir(&paritydb_path)?;

1237

1238

    let mut para_id: Option<ParaId> = None;

1239

1240

    // Iterate over each entry in the paritydb directory

1241

    for entry in entry_iter {

1242

        let entry = entry?;

1243

        let sub_path = entry.path();

1244

1245

        // Only consider directories

1246

        if !sub_path.is_dir() {

1247

            continue;

1248

1249

1250

        let sub_path_file_name = match sub_path.file_name().and_then(|s| s.to_str()) {

1251

            Some(x) => x,

1252

            None => {

1253

                continue;

1254

1255

};

1256

1257

        // That follow this pattern

1258

        if !sub_path_file_name.starts_with("full-container-") {

1259

            continue;

1260

1261

1262

        if let Some(id) = parse_para_id_from_folder_name(sub_path_file_name) {

1263

            if para_id.is_some() {

1264

                // If there is more than one folder with a para id, assume this folder is

1265

                // corrupted and ignore it, keep it for manual deletion

1266

                return Err(std::io::Error::new(std::io::ErrorKind::Other, ""));

1267

1268

            para_id = Some(id);

1269

1270

1271

1272

    Ok(para_id)

1273

1274

1275

// Input:

1276

// full-container-2000

1277

// Output:

1278

// Some(2000)

1279

5

fn parse_para_id_from_folder_name(folder_name: &str) -> Option<ParaId> {

1280

    // Find last '-' in string

1281

5

    let idx = folder_name.rfind('-')?;

1282

    // +1 to skip the '-'

1283

3

    let id_str = &folder_name[idx + 1..];

1284

    // Try to parse as u32, in case of error return None

1285

3

    let id = id_str.parse::<u32>().ok()?;

1286

1287

1

    Some(id.into())

1288

5

1289

1290

#[cfg(test)]

1291

mod tests {

1292

    use {super::*, std::path::PathBuf};

1293

1294

    // Copy of ContainerChainSpawner with extra assertions for tests, and mocked spawn function.

1295

    struct MockContainerChainSpawner {

1296

        state: Arc<Mutex<ContainerChainSpawnerState>>,

1297

        orchestrator_para_id: ParaId,

1298

        collate_on_tanssi: Arc<

1299

            dyn Fn() -> (CancellationToken, futures::channel::oneshot::Receiver<()>) + Send + Sync,

1300

>,

1301

        collation_cancellation_constructs: Option<()>,

1302

        // Keep track of the last CollateOn message, for tests

1303

        currently_collating_on: Arc<Mutex<Option<ParaId>>>,

1304

1305

1306

    impl MockContainerChainSpawner {

1307

10

        fn new() -> Self {

1308

10

            let orchestrator_para_id = 1000.into();

1309

10

            // The node always starts as an orchestrator chain collator

1310

10

            let currently_collating_on = Arc::new(Mutex::new(Some(orchestrator_para_id)));

1311

10

            let currently_collating_on2 = currently_collating_on.clone();

1312

10

            let collate_closure = move || {

1313

3

                let mut cco = currently_collating_on2.lock().unwrap();

1314

3

                assert_ne!(

1315

3

                    *cco,

1316

3

                    Some(orchestrator_para_id),

1317

                    "Received CollateOn message when we were already collating on this chain: {}",

1318

                    orchestrator_para_id

1319

);

1320

3

                *cco = Some(orchestrator_para_id);

1321

3

                let (_, receiver) = futures::channel::oneshot::channel();

1322

3

                (CancellationToken::new(), receiver)

1323

3

};

1324

10

            let collate_on_tanssi: Arc<

1325

10

                dyn Fn() -> (CancellationToken, futures::channel::oneshot::Receiver<()>)

1326

10

                    + Send

1327

10

                    + Sync,

1328

10

            > = Arc::new(collate_closure);

1329

10

1330

10

            Self {

1331

10

                state: Arc::new(Mutex::new(ContainerChainSpawnerState {

1332

10

                    spawned_container_chains: Default::default(),

1333

10

                    assigned_para_id: Some(orchestrator_para_id),

1334

10

                    next_assigned_para_id: None,

1335

10

                    failed_para_ids: Default::default(),

1336

10

                    spawned_containers_monitor: Default::default(),

1337

10

                })),

1338

10

                orchestrator_para_id,

1339

10

                collate_on_tanssi,

1340

10

                // Some if collator starts on orchestrator chain

1341

10

                collation_cancellation_constructs: Some(()),

1342

10

                currently_collating_on,

1343

10

1344

10

1345

1346

21

        fn spawn(&self, container_chain_para_id: ParaId, start_collation: bool) {

1347

21

            let (signal, _on_exit) = oneshot::channel();

1348

21

            let currently_collating_on2 = self.currently_collating_on.clone();

1349

21

            let collate_closure = move || {

1350

13

                let mut cco = currently_collating_on2.lock().unwrap();

1351

13

                assert_ne!(

1352

13

                    *cco,

1353

13

                    Some(container_chain_para_id),

1354

                    "Received CollateOn message when we were already collating on this chain: {}",

1355

                    container_chain_para_id

1356

);

1357

13

                *cco = Some(container_chain_para_id);

1358

13

                let (_, receiver) = futures::channel::oneshot::channel();

1359

13

                (CancellationToken::new(), receiver)

1360

13

};

1361

21

            let collate_on: Arc<

1362

21

                dyn Fn() -> (CancellationToken, futures::channel::oneshot::Receiver<()>)

1363

21

                    + Send

1364

21

                    + Sync,

1365

21

            > = Arc::new(collate_closure);

1366

21

            // Dummy db_path for tests, is not actually used

1367

21

            let db_path = PathBuf::from(format!("/tmp/container-{}/db", container_chain_para_id));

1368

21

1369

21

            let old = self

1370

21

                .state

1371

21

                .lock()

1372

21

                .expect("poison error")

1373

21

                .spawned_container_chains

1374

21

                .insert(

1375

21

                    container_chain_para_id,

1376

21

                    ContainerChainState {

1377

21

                        stop_handle: StopContainerChain { signal, id: 0 },

1378

21

                        db_path,

1379

21

},

1380

21

);

1381

21

1382

21

            assert!(

1383

21

                old.is_none(),

1384

                "tried to spawn a container chain that was already running: {}",

1385

                container_chain_para_id

1386

);

1387

1388

21

            if start_collation {

1389

13

                let (_cancellation_token, _exit_receiver) = collate_on();

1390

13

1391

21

1392

1393

15

        fn stop(&self, container_chain_para_id: ParaId) {

1394

15

            let stop_handle = self

1395

15

                .state

1396

15

                .lock()

1397

15

                .expect("poison error")

1398

15

                .spawned_container_chains

1399

15

                .remove(&container_chain_para_id);

1400

15

1401

15

            match stop_handle {

1402

15

                Some(_stop_handle) => {

1403

15

                    log::info!("Stopping container chain {}", container_chain_para_id);

1404

1405

                None => {

1406

                    panic!(

1407

                        "Tried to stop a container chain that is not running: {}",

1408

                        container_chain_para_id

1409

);

1410

1411

1412

1413

            // Update currently_collating_on, if we stopped the chain we are no longer collating there

1414

15

            let mut lco = self.currently_collating_on.lock().unwrap();

1415

15

            if *lco == Some(container_chain_para_id) {

1416

7

                *lco = None;

1417

8

1418

15

1419

1420

35

        fn handle_update_assignment(&mut self, current: Option<ParaId>, next: Option<ParaId>) {

1421

35

            let HandleUpdateAssignmentResult {

1422

35

                chains_to_stop,

1423

35

                chains_to_start,

1424

35

                need_to_restart,

1425

35

            } = handle_update_assignment_state_change(

1426

35

                &mut self.state.lock().unwrap(),

1427

35

                self.orchestrator_para_id,

1428

35

                current,

1429

35

                next,

1430

35

);

1431

35

1432

35

            if current != Some(self.orchestrator_para_id) {

1433

                // If not assigned to orchestrator chain anymore, we need to stop the collator process

1434

27

                let mut cco = self.currently_collating_on.lock().unwrap();

1435

27

                if *cco == Some(self.orchestrator_para_id) {

1436

10

                    *cco = None;

1437

17

1438

27

                self.collation_cancellation_constructs = None;

1439

8

            } else if self.collation_cancellation_constructs.is_none() {

1440

3

                let (_cancellation_token, _exit_notification_receiver) = (self.collate_on_tanssi)();

1441

3

                self.collation_cancellation_constructs = Some(());

1442

5

1443

1444

            // Assert we never start and stop the same container chain

1445

56

            for para_id in &chains_to_start {

1446

21

                if !need_to_restart {

1447

4

                    assert!(

1448

4

                        !chains_to_stop.contains(para_id),

1449

                        "Tried to start and stop same container chain: {}",

1450

                        para_id

1451

);

1452

                } else {

1453

                    // Will try to start and stop container chain with id "current" or "next", so ignore that

1454

17

                    if Some(*para_id) != current && Some(*para_id) != next {

1455

                        assert!(

1456

                            !chains_to_stop.contains(para_id),

1457

                            "Tried to start and stop same container chain: {}",

1458

                            para_id

1459

);

1460

17

1461

1462

1463

            // Assert we never start or stop the orchestrator chain

1464

35

            assert!(!chains_to_start.contains(&self.orchestrator_para_id));

1465

35

            assert!(!chains_to_stop.contains(&self.orchestrator_para_id));

1466

1467

            // Stop all container chains that are no longer needed

1468

50

            for para_id in chains_to_stop {

1469

15

                self.stop(para_id);

1470

15

1471

1472

            // Start all new container chains (usually 1)

1473

56

            for para_id in chains_to_start {

1474

21

                // Edge case: when starting the node it may be assigned to a container chain, so we need to

1475

21

                // start a container chain already collating.

1476

21

                let start_collation = Some(para_id) == current;

1477

21

                self.spawn(para_id, start_collation);

1478

21

1479

1480

            // Assert that if we are currently assigned to a container chain, we are collating there

1481

35

            if let Some(para_id) = current {

1482

24

                self.assert_collating_on(Some(para_id));

1483

24

            } else {

1484

11

                self.assert_collating_on(None);

1485

11

1486

35

1487

1488

        #[track_caller]

1489

71

        fn assert_collating_on(&self, para_id: Option<ParaId>) {

1490

71

            let currently_collating_on = *self.currently_collating_on.lock().unwrap();

1491

71

            assert_eq!(currently_collating_on, para_id);

1492

71

1493

1494

        #[track_caller]

1495

36

        fn assert_running_chains(&self, para_ids: &[ParaId]) {

1496

36

            let mut actually_running: Vec<ParaId> = self

1497

36

                .state

1498

36

                .lock()

1499

36

                .unwrap()

1500

36

                .spawned_container_chains

1501

36

                .keys()

1502

36

                .cloned()

1503

36

                .collect();

1504

36

            actually_running.sort();

1505

36

            let mut should_be_running = para_ids.to_vec();

1506

36

            should_be_running.sort();

1507

36

            assert_eq!(actually_running, should_be_running);

1508

36

1509

1510

1511

    /// A new spawner collates on the orchestrator chain until it loses its assignment.
    #[test]
    fn starts_collating_on_tanssi() {
        let orchestrator = ParaId::from(1000);
        let mut spawner = MockContainerChainSpawner::new();

        // Initial state: collating on the orchestrator chain, no container chains
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);

        // No assignment at all: collation stops
        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);
    }

1521

1522

    /// Assignments that only involve the orchestrator chain never spawn a container chain.
    #[test]
    fn assigned_to_orchestrator_chain() {
        let orchestrator = ParaId::from(1000);
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(orchestrator), Some(orchestrator));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);

        spawner.handle_update_assignment(Some(orchestrator), None);
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);

        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        spawner.handle_update_assignment(None, Some(orchestrator));
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        spawner.handle_update_assignment(Some(orchestrator), Some(orchestrator));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);
    }

1546

1547

    /// Being assigned to a container chain, now or next, keeps that chain running.
    #[test]
    fn assigned_to_container_chain() {
        let container = ParaId::from(2000);
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        spawner.handle_update_assignment(Some(container), None);
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        spawner.handle_update_assignment(None, Some(container));
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[container]);

        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);
    }

1571

1572

    /// Walks through orchestrator -> 2000 -> 2001 -> orchestrator, checking spawned chains.
    #[test]
    fn spawn_container_chains() {
        let orchestrator = ParaId::from(1000);
        let first = ParaId::from(2000);
        let second = ParaId::from(2001);
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(orchestrator), Some(first));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[first]);

        spawner.handle_update_assignment(Some(first), Some(first));
        spawner.assert_collating_on(Some(first));
        spawner.assert_running_chains(&[first]);

        spawner.handle_update_assignment(Some(first), Some(second));
        spawner.assert_collating_on(Some(first));
        spawner.assert_running_chains(&[first, second]);

        spawner.handle_update_assignment(Some(second), Some(second));
        spawner.assert_collating_on(Some(second));
        spawner.assert_running_chains(&[second]);

        spawner.handle_update_assignment(Some(second), Some(orchestrator));
        spawner.assert_collating_on(Some(second));
        spawner.assert_running_chains(&[second]);

        spawner.handle_update_assignment(Some(orchestrator), Some(orchestrator));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);
    }

1600

1601

    /// Going from (2000, 2001) to (2001, 2000) shouldn't start or stop any container chains.
    #[test]
    fn swap_current_next() {
        let first = ParaId::from(2000);
        let second = ParaId::from(2001);
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(first), Some(second));
        spawner.assert_collating_on(Some(first));
        spawner.assert_running_chains(&[first, second]);

        spawner.handle_update_assignment(Some(second), Some(first));
        spawner.assert_collating_on(Some(second));
        spawner.assert_running_chains(&[first, second]);
    }

1614

1615

    /// Orchestrator collation stops when unassigned and resumes when assigned again.
    #[test]
    fn stop_collating_orchestrator() {
        let orchestrator = ParaId::from(1000);
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(orchestrator), Some(orchestrator));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);

        spawner.handle_update_assignment(Some(orchestrator), None);
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);

        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        spawner.handle_update_assignment(Some(orchestrator), None);
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);
    }

1635

1636

    /// A container chain that was stopped and spawned again must receive a new CollateOn.
    #[test]
    fn stop_collating_container() {
        let container = ParaId::from(2000);
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(container), None);
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        spawner.handle_update_assignment(None, Some(container));
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[container]);

        // This will send a CollateOn message to the same chain as the last CollateOn,
        // but this is needed because that chain has been stopped
        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);
    }

1658

1659

    /// A stopped container chain can be assigned as "current" again without being "next" first.
    #[test]
    fn stop_collating_container_start_immediately() {
        let container = ParaId::from(2000);
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(container), None);
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        // This will start the chain already collating
        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);
    }

1676

1677

    /// Dropping every assignment stops all running container chains at once.
    #[test]
    fn stop_all_chains() {
        let first = ParaId::from(2000);
        let second = ParaId::from(2001);
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(first), Some(second));
        spawner.assert_collating_on(Some(first));
        spawner.assert_running_chains(&[first, second]);

        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);
    }

1689

1690

    /// A chain that moves from "current" to "next" and back keeps running the whole time.
    #[test]
    fn keep_collating_on_container() {
        let container = ParaId::from(2000);
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(container), None);
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        spawner.handle_update_assignment(None, Some(container));
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[container]);

        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);
    }

1706

1707

    /// Boot nodes that cannot be parsed are dropped, valid ones are kept.
    #[test]
    fn invalid_boot_nodes_are_ignored() {
        let para_id = 100.into();
        let valid_bootnode =
            b"/ip4/127.0.0.1/tcp/33049/ws/p2p/12D3KooWHVMhQDHBpj9vQmssgyfspYecgV6e3hH1dQVDUkUbCYC9"
                .to_vec();

        // Neither of these inputs is a usable boot node
        for invalid_bootnode in [b"A".to_vec(), b"\xff".to_vec()] {
            assert_eq!(
                parse_boot_nodes_ignore_invalid(vec![invalid_bootnode], para_id),
                vec![]
            );
        }

        // Valid boot nodes are not ignored
        let parsed = parse_boot_nodes_ignore_invalid(vec![valid_bootnode], para_id);
        assert_eq!(parsed.len(), 1);
    }

1727

1728

    /// Test the implementation of `delete_container_chain_db`: the second ancestor of a
    /// database path is the chain folder that contains the `paritydb` directory.
    #[test]
    fn path_ancestors() {
        let expected_chain_dir = PathBuf::from(
            "/tmp/zombienet/Collator2002-01/data/containers/chains/simple_container_2002",
        );
        let full_db_path = PathBuf::from("/tmp/zombienet/Collator2002-01/data/containers/chains/simple_container_2002/paritydb/full-container-2002");

        // ancestors() yields the path itself first, so index 2 skips
        // "full-container-2002" and "paritydb"
        let chain_dir = full_db_path.ancestors().nth(2).unwrap();

        assert_eq!(chain_dir, expected_chain_dir);
    }

1741

1742

    /// Only folder names that end in `-<u32>` yield a para id.
    #[test]
    fn para_id_from_folder_name() {
        // Names with no '-' or with a non-numeric suffix are rejected
        for rejected in ["", "full", "full-container", "full-container-"] {
            assert_eq!(parse_para_id_from_folder_name(rejected), None);
        }

        let parsed = parse_para_id_from_folder_name("full-container-2000");
        assert_eq!(parsed, Some(ParaId::from(2000)));
    }

1753