Grcov report - spawner.rs

1

// Copyright (C) Moondance Labs Ltd.

2

// This file is part of Tanssi.

3

4

// Tanssi is free software: you can redistribute it and/or modify

5

// it under the terms of the GNU General Public License as published by

6

// the Free Software Foundation, either version 3 of the License, or

7

// (at your option) any later version.

8

9

// Tanssi is distributed in the hope that it will be useful,

10

// but WITHOUT ANY WARRANTY; without even the implied warranty of

11

// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

12

// GNU General Public License for more details.

13

14

// You should have received a copy of the GNU General Public License

15

// along with Tanssi.  If not, see <http://www.gnu.org/licenses/>.

16

17

//! Container Chain Spawner

18

//!

19

//! Controls the starting and stopping of container chains.

20

//!

21

//! For more information about when the database is deleted, check the

22

//! [Keep db flowchart](https://raw.githubusercontent.com/moondance-labs/tanssi/master/docs/keep_db_flowchart.png)

23

24

use {

25

    crate::{

26

        cli::ContainerChainCli,

27

        monitor::{SpawnedContainer, SpawnedContainersMonitor},

28

        service::{start_node_impl_container, ContainerChainClient, ParachainClient},

29

},

30

    cumulus_primitives_core::ParaId,

31

    cumulus_relay_chain_interface::RelayChainInterface,

32

    dancebox_runtime::{opaque::Block as OpaqueBlock, Block},

33

    dc_orchestrator_chain_interface::{OrchestratorChainInterface, PHash},

34

    fs2::FileExt,

35

    futures::FutureExt,

36

    node_common::command::generate_genesis_block,

37

    pallet_author_noting_runtime_api::AuthorNotingApi,

38

    polkadot_primitives::CollatorPair,

39

    sc_cli::{Database, SyncMode},

40

    sc_network::config::MultiaddrWithPeerId,

41

    sc_service::SpawnTaskHandle,

42

    sc_transaction_pool::FullPool,

43

    sp_api::ProvideRuntimeApi,

44

    sp_core::H256,

45

    sp_keystore::KeystorePtr,

46

    sp_runtime::traits::Block as BlockT,

47

    std::{

48

        collections::{HashMap, HashSet},

49

        path::{Path, PathBuf},

50

        sync::{Arc, Mutex},

51

        time::Instant,

52

},

53

    tokio::{

54

        sync::{mpsc, oneshot},

55

        time::{sleep, Duration},

56

},

57

    tokio_util::sync::CancellationToken,

58

};

59

60

/// Timeout to wait for the database to close before starting it again, used in
/// `wait_for_paritydb_lock`.
///
/// This is an upper bound, not a fixed delay: if the db is closed after 1 second then that
/// function will only wait 1 second.
const MAX_DB_RESTART_TIMEOUT: Duration = Duration::from_secs(60);

63

64

/// Block diff threshold above which we decide it will be faster to delete the database and
/// use warp sync, rather than using full sync to download a large number of blocks.
///
/// This is only needed because warp sync does not support syncing from a state that is not
/// genesis, it falls back to full sync in that case.
///
/// 30_000 blocks = 50 hours at 6s/block.
/// Assuming a syncing speed of 100 blocks per second, this will take 5 minutes to sync.
const MAX_BLOCK_DIFF_FOR_FULL_SYNC: u32 = 30_000;

71

72

/// Task that handles spawning a stopping container chains based on assignment.

73

/// The main loop is [rx_loop](ContainerChainSpawner::rx_loop).

74

pub struct ContainerChainSpawner {

75

    /// Start container chain params

76

    pub params: ContainerChainSpawnParams,

77

78

    /// State

79

    pub state: Arc<Mutex<ContainerChainSpawnerState>>,

80

81

    /// Async callback that enables collation on the orchestrator chain

82

    pub collate_on_tanssi:

83

        Arc<dyn Fn() -> (CancellationToken, futures::channel::oneshot::Receiver<()>) + Send + Sync>,

84

    /// Stores the cancellation token used to stop the orchestrator chain collator process.

85

    /// When this is None, the orchestrator collator is not running.

86

    pub collation_cancellation_constructs:

87

        Option<(CancellationToken, futures::channel::oneshot::Receiver<()>)>,

88

89

90

/// Struct with all the params needed to start a container chain node given the CLI arguments,

91

/// and creating the ChainSpec from on-chain data from the orchestrator chain.

92

/// These params must be the same for all container chains, params that change such as the

93

/// `container_chain_para_id` should be passed as separate arguments to the [try_spawn] function.

94

///

95

/// This struct MUST NOT contain types (outside of `Option<CollationParams>`) obtained through

96

/// running an embeded orchestrator node, as this will prevent spawning a container chain in a node

97

/// connected to an orchestrator node through WebSocket.

98

#[derive(Clone)]

99

pub struct ContainerChainSpawnParams {

100

    pub orchestrator_chain_interface: Arc<dyn OrchestratorChainInterface>,

101

    pub container_chain_cli: ContainerChainCli,

102

    pub tokio_handle: tokio::runtime::Handle,

103

    pub chain_type: sc_chain_spec::ChainType,

104

    pub relay_chain: String,

105

    pub relay_chain_interface: Arc<dyn RelayChainInterface>,

106

    pub sync_keystore: KeystorePtr,

107

    pub orchestrator_para_id: ParaId,

108

    pub spawn_handle: SpawnTaskHandle,

109

    pub collation_params: Option<CollationParams>,

110

    pub data_preserver: bool,

111

112

113

/// Params specific to collation. This struct can contain types obtained through running an

114

/// embeded orchestrator node.

115

#[derive(Clone)]

116

pub struct CollationParams {

117

    pub collator_key: CollatorPair,

118

    pub orchestrator_tx_pool: Option<Arc<FullPool<OpaqueBlock, ParachainClient>>>,

119

    pub orchestrator_client: Option<Arc<ParachainClient>>,

120

    pub orchestrator_para_id: ParaId,

121

    /// If this is `false`, then `orchestrator_tx_pool` and `orchestrator_client` must be `Some`.

122

    pub solochain: bool,

123

124

125

/// Mutable state for container chain spawner. Keeps track of running chains.

126

#[derive(Default)]

127

pub struct ContainerChainSpawnerState {

128

    spawned_container_chains: HashMap<ParaId, ContainerChainState>,

129

    assigned_para_id: Option<ParaId>,

130

    next_assigned_para_id: Option<ParaId>,

131

    failed_para_ids: HashSet<ParaId>,

132

    // For debugging and detecting errors

133

    pub spawned_containers_monitor: SpawnedContainersMonitor,

134

135

136

pub struct ContainerChainState {

137

    /// Handle that can be used to stop the container chain

138

    stop_handle: StopContainerChain,

139

    /// Database path

140

    db_path: PathBuf,

141

142

143

/// Stops a container chain when signal is sent. The bool means `keep_db`, whether to keep the

144

/// container chain database (true) or remove it (false).

145

pub struct StopContainerChain {

146

    signal: oneshot::Sender<bool>,

147

    id: usize,

148

149

150

/// Messages used to control the `ContainerChainSpawner`. This is needed because one of the fields

151

/// of `ContainerChainSpawner` is not `Sync`, so we cannot simply pass an

152

/// `Arc<ContainerChainSpawner>` to other threads.

153

#[derive(Debug)]

154

pub enum CcSpawnMsg {

155

    /// Update container chain assignment

156

    UpdateAssignment {

157

        current: Option<ParaId>,

158

        next: Option<ParaId>,

159

},

160

161

162

// Separate function to allow using `?` to return a result, and also to avoid using `self` in an

163

// async function. Mutable state should be written by locking `state`.

164

// TODO: `state` should be an async mutex

165

async fn try_spawn(

166

    try_spawn_params: ContainerChainSpawnParams,

167

    state: Arc<Mutex<ContainerChainSpawnerState>>,

168

    container_chain_para_id: ParaId,

169

    start_collation: bool,

170

) -> sc_service::error::Result<()> {

171

    let ContainerChainSpawnParams {

172

        orchestrator_chain_interface,

173

        mut container_chain_cli,

174

        tokio_handle,

175

        chain_type,

176

        relay_chain,

177

        relay_chain_interface,

178

        sync_keystore,

179

        spawn_handle,

180

        mut collation_params,

181

        data_preserver,

182

..

183

    } = try_spawn_params;

184

    // Preload genesis data from orchestrator chain storage.

185

186

    // TODO: the orchestrator chain node may not be fully synced yet,

187

    // in that case we will be reading an old state.

188

    let orchestrator_block_hash = orchestrator_chain_interface

189

        .finalized_block_hash()

190

        .await

191

        .map_err(|e| format!("Failed to get latest block hash: {e}"))?;

192

193

    log::info!(

194

        "Detected assignment for container chain {}",

195

        container_chain_para_id

196

);

197

198

    let genesis_data = orchestrator_chain_interface

199

        .genesis_data(orchestrator_block_hash, container_chain_para_id)

200

        .await

201

        .map_err(|e| format!("Failed to call genesis_data runtime api: {}", e))?

202

        .ok_or_else(|| {

203

            format!(

204

                "No genesis data registered for container chain id {}",

205

                container_chain_para_id

206

207

        })?;

208

209

    let boot_nodes_raw = orchestrator_chain_interface

210

        .boot_nodes(orchestrator_block_hash, container_chain_para_id)

211

        .await

212

        .map_err(|e| format!("Failed to call boot_nodes runtime api: {}", e))?;

213

214

    if boot_nodes_raw.is_empty() {

215

        log::warn!(

216

            "No boot nodes registered on-chain for container chain {}",

217

            container_chain_para_id

218

);

219

220

    let boot_nodes = parse_boot_nodes_ignore_invalid(boot_nodes_raw, container_chain_para_id);

221

    if boot_nodes.is_empty() {

222

        log::warn!(

223

            "No valid boot nodes for container chain {}",

224

            container_chain_para_id

225

);

226

227

228

    container_chain_cli

229

        .preload_chain_spec_from_genesis_data(

230

            container_chain_para_id.into(),

231

            genesis_data,

232

            chain_type.clone(),

233

            relay_chain.clone(),

234

            boot_nodes,

235

236

        .map_err(|e| {

237

            format!(

238

                "failed to create container chain chain spec from on chain genesis data: {}",

239

240

241

        })?;

242

243

    log::info!(

244

        "Loaded chain spec for container chain {}",

245

        container_chain_para_id

246

);

247

248

    if !data_preserver && !start_collation {

249

        log::info!("This is a syncing container chain, using random ports");

250

251

        collation_params = None;

252

253

        // Use random ports to avoid conflicts with the other running container chain

254

        let random_ports = [23456, 23457, 23458];

255

256

        container_chain_cli

257

            .base

258

            .base

259

            .prometheus_params

260

            .prometheus_port = Some(random_ports[0]);

261

        container_chain_cli.base.base.network_params.port = Some(random_ports[1]);

262

        container_chain_cli.base.base.rpc_port = Some(random_ports[2]);

263

264

265

    let validator = collation_params.is_some();

266

267

    // Update CLI params

268

    container_chain_cli.base.para_id = Some(container_chain_para_id.into());

269

    container_chain_cli

270

        .base

271

        .base

272

        .import_params

273

        .database_params

274

        .database = Some(Database::ParityDb);

275

276

    let keep_db = container_chain_cli.base.keep_db;

277

278

    // Get a closure that checks if db_path exists.Need this to know when to use full sync instead of warp sync.

279

    let check_db_exists = {

280

        // Get db_path from config

281

        let mut container_chain_cli_config = sc_cli::SubstrateCli::create_configuration(

282

            &container_chain_cli,

283

            &container_chain_cli,

284

            tokio_handle.clone(),

285

286

        .map_err(|err| format!("Container chain argument error: {}", err))?;

287

288

        // Change database path to make it depend on container chain para id

289

        // So instead of the usual "db/full" we have "db/full-container-2000"

290

        let mut db_path = container_chain_cli_config

291

            .database

292

            .path()

293

            .ok_or_else(|| "Failed to get database path".to_string())?

294

            .to_owned();

295

        db_path.set_file_name(format!("full-container-{}", container_chain_para_id));

296

        container_chain_cli_config.database.set_path(&db_path);

297

298

        // Return a closure because we may need to check if the db exists multiple times

299

        move || db_path.exists()

300

};

301

302

    // Start container chain node. After starting, check if the database is good or needs to

303

    // be removed. If the db needs to be removed, this function will handle the node restart, and

304

    // return the components of a running container chain node.

305

    // This should be a separate function, but it has so many arguments that I prefer to have it as a closure for now

306

    let start_node_impl_container_with_restart = || async move {

307

        // Loop will run at most 2 times: 1 time if the db is good and 2 times if the db needs to be removed

308

        for _ in 0..2 {

309

            let db_existed_before = check_db_exists();

310

            container_chain_cli.base.base.network_params.sync = SyncMode::Warp;

311

            log::info!(

312

                "Container chain sync mode: {:?}",

313

                container_chain_cli.base.base.network_params.sync

314

);

315

316

            let mut container_chain_cli_config = sc_cli::SubstrateCli::create_configuration(

317

                &container_chain_cli,

318

                &container_chain_cli,

319

                tokio_handle.clone(),

320

321

            .map_err(|err| format!("Container chain argument error: {}", err))?;

322

323

            // Change database path to make it depend on container chain para id

324

            // So instead of the usual "db/full" we have "db/full-container-2000"

325

            let mut db_path = container_chain_cli_config

326

                .database

327

                .path()

328

                .ok_or_else(|| "Failed to get database path".to_string())?

329

                .to_owned();

330

            db_path.set_file_name(format!("full-container-{}", container_chain_para_id));

331

            container_chain_cli_config.database.set_path(&db_path);

332

333

            let (container_chain_task_manager, container_chain_client, container_chain_db) =

334

                start_node_impl_container(

335

                    container_chain_cli_config,

336

                    relay_chain_interface.clone(),

337

                    orchestrator_chain_interface.clone(),

338

                    sync_keystore.clone(),

339

                    container_chain_para_id,

340

                    collation_params.clone(),

341

342

                .await?;

343

344

            // Keep all node parts in one variable to make them easier to drop

345

            let node_parts = (

346

                container_chain_task_manager,

347

                container_chain_client,

348

                container_chain_db,

349

                db_path,

350

);

351

352

            if db_existed_before {

353

                // If the database already existed before, check if it can be used or it needs to be removed.

354

                // To remove the database, we restart the node, wait for the db to close to avoid a

355

                // "shutdown error" log, and then remove it.

356

                if let Some(db_removal_reason) = db_needs_removal(

357

                    &node_parts.1,

358

                    &orchestrator_chain_interface,

359

                    orchestrator_block_hash,

360

                    container_chain_para_id,

361

                    &container_chain_cli,

362

                    container_chain_cli.base.keep_db,

363

364

                .await?

365

366

                    let db_path = node_parts.3.clone();

367

                    // Important, drop `node_parts` before trying to `wait_for_paritydb_lock`

368

                    drop(node_parts);

369

                    // Wait here to for the database created in the previous loop iteration to close.

370

                    // Dropping is not enough because there is some background process that keeps the database open,

371

                    // so we check the paritydb lock file directly.

372

                    log::info!(

373

                        "Restarting container chain {} after db deletion. Reason: {:?}",

374

                        container_chain_para_id,

375

                        db_removal_reason,

376

);

377

                    wait_for_paritydb_lock(&db_path, MAX_DB_RESTART_TIMEOUT)

378

                        .await

379

                        .map_err(|e| {

380

                            log::warn!(

381

                                "Error waiting for chain {} to release db lock: {:?}",

382

                                container_chain_para_id,

383

384

);

385

386

387

                        })?;

388

                    delete_container_chain_db(&db_path);

389

390

                    // Recursion, will only happen once because `db_existed_before` will be false after

391

                    // removing the db. Apparently closures cannot be recursive so fake recursion by

392

                    // using a loop + continue

393

                    continue;

394

395

396

397

            // If using full sync, print a warning if the local db is at block 0 and the chain has thousands of blocks

398

            if container_chain_cli.base.base.network_params.sync == SyncMode::Full {

399

                let last_container_block_temp = node_parts.1.chain_info().best_number;

400

                let cc_block_num = get_latest_container_block_number_from_orchestrator(

401

                    &orchestrator_chain_interface,

402

                    orchestrator_block_hash,

403

                    container_chain_para_id,

404

405

                .await

406

                .unwrap_or(0);

407

                if last_container_block_temp == 0 && cc_block_num > MAX_BLOCK_DIFF_FOR_FULL_SYNC {

408

                    let db_folder = format!("full-container-{}", container_chain_para_id);

409

                    log::error!("\

410

                        Existing database for container chain {} is at block 0, assuming that warp sync failed.\n\

411

                        The node will now use full sync, which has to download {} blocks.\n\

412

                        If running as collator, it may not finish syncing on time and miss block rewards.\n\

413

                        To force using warp sync, stop tanssi-node and manually remove the db folder: {:?}\n\

414

                        ", container_chain_para_id, cc_block_num, db_folder)

415

416

417

418

            return sc_service::error::Result::Ok(node_parts);

419

420

421

        unreachable!("Above loop can run at most 2 times, and in the second iteration it is guaranteed to return")

422

};

423

424

    let (mut container_chain_task_manager, container_chain_client, container_chain_db, db_path) =

425

        start_node_impl_container_with_restart().await?;

426

427

    // Signal that allows to gracefully stop a container chain

428

    let (signal, on_exit) = oneshot::channel::<bool>();

429

430

    let monitor_id;

431

432

        let mut state = state.lock().expect("poison error");

433

434

        monitor_id = state.spawned_containers_monitor.push(SpawnedContainer {

435

            id: 0,

436

            para_id: container_chain_para_id,

437

            start_time: Instant::now(),

438

            stop_signal_time: None,

439

            stop_task_manager_time: None,

440

            stop_refcount_time: Default::default(),

441

            backend: Arc::downgrade(&container_chain_db),

442

            client: Arc::downgrade(&container_chain_client),

443

});

444

445

        if state

446

            .spawned_container_chains

447

            .contains_key(&container_chain_para_id)

448

449

            return Err(format!("Tried to spawn a container chain when another container chain with the same para id was already running: {:?}", container_chain_para_id).into());

450

451

        state.spawned_container_chains.insert(

452

            container_chain_para_id,

453

            ContainerChainState {

454

                stop_handle: StopContainerChain {

455

                    signal,

456

                    id: monitor_id,

457

},

458

                db_path: db_path.clone(),

459

},

460

);

461

462

463

    // Add the container chain task manager as a child task to the parent task manager.

464

    // We want to stop the node if this task manager stops, but we also want to allow a

465

    // graceful shutdown using the `on_exit` future.

466

    let name = "container-chain-task-manager";

467

    spawn_handle.spawn(name, None, async move {

468

        let mut container_chain_task_manager_future =

469

            container_chain_task_manager.future().fuse();

470

        let mut on_exit_future = on_exit.fuse();

471

472

        futures::select! {

473

            res1 = container_chain_task_manager_future => {

474

                // An essential task failed or the task manager was stopped unexpectedly

475

                // using `.terminate()`. This should stop the container chain but not the node.

476

                if res1.is_err() {

477

                    log::error!("Essential task failed in container chain {} task manager. Shutting down container chain service", container_chain_para_id);

478

                } else {

479

                    log::error!("Unexpected shutdown in container chain {} task manager. Shutting down container chain service", container_chain_para_id);

480

481

                // Mark this container chain as "failed to stop" to avoid warning in `self.stop()`

482

                let mut state = state.lock().expect("poison error");

483

                state.failed_para_ids.insert(container_chain_para_id);

484

                // Never delete db in this case because it is not a graceful shutdown

485

486

            stop_unassigned = on_exit_future => {

487

                // Graceful shutdown.

488

                // `stop_unassigned` will be `Ok(keep_db)` if `.stop()` has been called, which means that the

489

                // container chain has been unassigned, and will be `Err` if the handle has been dropped,

490

                // which means that the node is stopping.

491

                // Delete existing database if running as collator

492

                if validator && stop_unassigned == Ok(false) && !keep_db {

493

                    // If this breaks after a code change, make sure that all the variables that

494

                    // may keep the chain alive are dropped before the call to `wait_for_paritydb_lock`.

495

                    drop(container_chain_task_manager_future);

496

                    drop(container_chain_task_manager);

497

                    let db_closed = wait_for_paritydb_lock(&db_path, MAX_DB_RESTART_TIMEOUT)

498

                        .await

499

                        .map_err(|e| {

500

                            log::warn!(

501

                                "Error waiting for chain {} to release db lock: {:?}",

502

                                container_chain_para_id,

503

504

);

505

                        }).is_ok();

506

                    // If db has not closed in 60 seconds we do not delete it.

507

                    if db_closed {

508

                        delete_container_chain_db(&db_path);

509

510

511

512

513

514

        let mut state = state.lock().expect("poison error");

515

        state

516

            .spawned_containers_monitor

517

            .set_stop_task_manager_time(monitor_id, Instant::now());

518

});

519

520

    Ok(())

521

522

523

/// Interface for spawning and stopping container chain embeded nodes.

524

pub trait Spawner {

525

    /// Access to the Orchestrator Chain Interface

526

    fn orchestrator_chain_interface(&self) -> Arc<dyn OrchestratorChainInterface>;

527

528

    /// Try to start a new container chain. In case of an error, this does not stop the node, and

529

    /// the container chain will be attempted to spawn again when the collator is reassigned to it.

530

///

531

    /// It is possible that we try to spawn-stop-spawn the same chain, and the second spawn fails

532

    /// because the chain has not stopped yet, because `stop` does not wait for the chain to stop,

533

    /// so before calling `spawn` make sure to call `wait_for_paritydb_lock` before, like we do in

534

    /// `handle_update_assignment`.

535

    fn spawn(

536

        &self,

537

        container_chain_para_id: ParaId,

538

        start_collation: bool,

539

    ) -> impl std::future::Future<Output = ()> + Send;

540

541

    /// Stop a container chain. Prints a warning if the container chain was not running.

542

    /// Returns the database path for the container chain, can be used with `wait_for_paritydb_lock`

543

    /// to ensure that the container chain has fully stopped. The database path can be `None` if the

544

    /// chain was not running.

545

    fn stop(&self, container_chain_para_id: ParaId, keep_db: bool) -> Option<PathBuf>;

546

547

548

impl Spawner for ContainerChainSpawner {

549

    /// Access to the Orchestrator Chain Interface

550

    fn orchestrator_chain_interface(&self) -> Arc<dyn OrchestratorChainInterface> {

551

        self.params.orchestrator_chain_interface.clone()

552

553

554

    /// Try to start a new container chain. In case of an error, this does not stop the node, and

555

    /// the container chain will be attempted to spawn again when the collator is reassigned to it.

556

///

557

    /// It is possible that we try to spawn-stop-spawn the same chain, and the second spawn fails

558

    /// because the chain has not stopped yet, because `stop` does not wait for the chain to stop,

559

    /// so before calling `spawn` make sure to call `wait_for_paritydb_lock` before, like we do in

560

    /// `handle_update_assignment`.

561

    async fn spawn(&self, container_chain_para_id: ParaId, start_collation: bool) {

562

        let try_spawn_params = self.params.clone();

563

        let state = self.state.clone();

564

        let state2 = state.clone();

565

566

        match try_spawn(

567

            try_spawn_params,

568

            state,

569

            container_chain_para_id,

570

            start_collation,

571

572

        .await

573

574

            Ok(()) => {}

575

            Err(e) => {

576

                log::error!(

577

                    "Failed to start container chain {}: {}",

578

                    container_chain_para_id,

579

580

);

581

                // Mark this container chain as "failed to start"

582

                let mut state = state2.lock().expect("poison error");

583

                state.failed_para_ids.insert(container_chain_para_id);

584

585

586

587

588

    /// Stop a container chain. Prints a warning if the container chain was not running.

589

    /// Returns the database path for the container chain, can be used with `wait_for_paritydb_lock`

590

    /// to ensure that the container chain has fully stopped. The database path can be `None` if the

591

    /// chain was not running.

592

    fn stop(&self, container_chain_para_id: ParaId, keep_db: bool) -> Option<PathBuf> {

593

        let mut state = self.state.lock().expect("poison error");

594

        let stop_handle = state

595

            .spawned_container_chains

596

            .remove(&container_chain_para_id);

597

598

        match stop_handle {

599

            Some(stop_handle) => {

600

                log::info!("Stopping container chain {}", container_chain_para_id);

601

602

                let id = stop_handle.stop_handle.id;

603

                state

604

                    .spawned_containers_monitor

605

                    .set_stop_signal_time(id, Instant::now());

606

607

                // Send signal to perform graceful shutdown, which will delete the db if needed

608

                let _ = stop_handle.stop_handle.signal.send(keep_db);

609

610

                Some(stop_handle.db_path)

611

612

            None => {

613

                // Do not print the warning message if this is a container chain that has failed to

614

                // start, because in that case it will not be running

615

                if !state.failed_para_ids.remove(&container_chain_para_id) {

616

                    log::warn!(

617

                        "Tried to stop a container chain that is not running: {}",

618

                        container_chain_para_id

619

);

620

621

622

                None

623

624

625

626

627

628

impl ContainerChainSpawner {

629

    /// Receive and process `CcSpawnMsg`s indefinitely

630

    pub async fn rx_loop(

631

        mut self,

632

        mut rx: mpsc::UnboundedReceiver<CcSpawnMsg>,

633

        validator: bool,

634

        solochain: bool,

635

) {

636

        // The node always starts as an orchestrator chain collator.

637

        // This is because the assignment is detected after importing a new block, so if all

638

        // collators stop at the same time, when they start again nobody will produce the new block.

639

        // So all nodes start as orchestrator chain collators, until the first block is imported,

640

        // then the real assignment is used.

641

        // Except in solochain mode, then the initial assignment is None.

642

        if validator && !solochain {

643

            self.handle_update_assignment(Some(self.params.orchestrator_para_id), None)

644

                .await;

645

646

647

        while let Some(msg) = rx.recv().await {

648

            match msg {

649

                CcSpawnMsg::UpdateAssignment { current, next } => {

650

                    self.handle_update_assignment(current, next).await;

651

652

653

654

655

        // The while loop can end if all the senders get dropped, but since this is an

656

        // essential task we don't want it to stop. So await a future that never completes.

657

        // This should only happen when starting a full node.

658

        if !validator {

659

            let () = std::future::pending().await;

660

661

662

663

    /// Handle `CcSpawnMsg::UpdateAssignment`

664

    async fn handle_update_assignment(&mut self, current: Option<ParaId>, next: Option<ParaId>) {

665

        let HandleUpdateAssignmentResult {

666

            chains_to_stop,

667

            chains_to_start,

668

            need_to_restart: _,

669

        } = handle_update_assignment_state_change(

670

            &mut self.state.lock().expect("poison error"),

671

            self.params.orchestrator_para_id,

672

            current,

673

            next,

674

);

675

676

        if current != Some(self.params.orchestrator_para_id) {

677

            // If not assigned to orchestrator chain anymore, we need to stop the collator process

678

            let maybe_exit_notification_receiver = self

679

                .collation_cancellation_constructs

680

                .take()

681

                .map(|(cancellation_token, exit_notification_receiver)| {

682

                    cancellation_token.cancel();

683

                    exit_notification_receiver

684

});

685

686

            if let Some(exit_notification_receiver) = maybe_exit_notification_receiver {

687

                let _ = exit_notification_receiver.await;

688

689

        } else if self.collation_cancellation_constructs.is_none() {

690

            // If assigned to orchestrator chain but the collator process is not running, start it

691

            self.collation_cancellation_constructs = Some((self.collate_on_tanssi)());

692

693

694

        // Stop all container chains that are no longer needed

695

        let mut db_paths_restart = vec![];

696

        for para_id in chains_to_stop {

697

            // Keep db if we are currently assigned to this chain

698

            let keep_db = Some(para_id) == current;

699

            let maybe_db_path = self.stop(para_id, keep_db);

700

            // If we are restarting this chain, save its db_path to check when it actually stopped

701

            if let Some(db_path) = maybe_db_path {

702

                if chains_to_start.contains(&para_id) {

703

                    db_paths_restart.push((para_id, db_path));

704

705

706

707

708

        if !db_paths_restart.is_empty() {

709

            // Ensure the chains we stopped actually stopped by checking if their database is unlocked.

710

            // Using `join_all` because in one edge case we may be restarting 2 chains,

711

            // but almost always this will be only one future.

712

            let futs = db_paths_restart

713

                .into_iter()

714

                .map(|(para_id, db_path)| async move {

715

                    wait_for_paritydb_lock(&db_path, MAX_DB_RESTART_TIMEOUT)

716

                        .await

717

                        .map_err(|e| {

718

                            log::warn!(

719

                                "Error waiting for chain {} to release db lock: {:?}",

720

                                para_id,

721

722

);

723

})

724

});

725

            futures::future::join_all(futs).await;

726

727

728

        // Start all new container chains (usually 1)

729

        for para_id in chains_to_start {

730

            // Edge case: when starting the node it may be assigned to a container chain, so we need to

731

            // start a container chain already collating.

732

            // TODO: another edge case: if current == None, and running_chains == 0,

733

            // and chains_to_start == 1, we can start this chain as collating, and we won't need

734

            // to restart it on the next session. We need to add some extra state somewhere to

735

            // implement this properly.

736

            let start_collation = Some(para_id) == current;

737

            self.spawn(para_id, start_collation).await;

738

739

740

741

742

struct HandleUpdateAssignmentResult {

743

    chains_to_stop: Vec<ParaId>,

744

    chains_to_start: Vec<ParaId>,

745

    #[allow(dead_code)] // no longer used except in tests

746

    need_to_restart: bool,

747

748

749

// This is a separate function to allow testing

750

35

fn handle_update_assignment_state_change(

751

35

    state: &mut ContainerChainSpawnerState,

752

35

    orchestrator_para_id: ParaId,

753

35

    current: Option<ParaId>,

754

35

    next: Option<ParaId>,

755

35

) -> HandleUpdateAssignmentResult {

756

35

    if (state.assigned_para_id, state.next_assigned_para_id) == (current, next) {

757

        // If nothing changed there is nothing to update

758

        return HandleUpdateAssignmentResult {

759

            chains_to_stop: Default::default(),

760

            chains_to_start: Default::default(),

761

            need_to_restart: false,

762

};

763

35

764

35

765

35

    // Create a set with the container chains that were running before, and the container

766

35

    // chains that should be running after the updated assignment. This is used to calculate

767

35

    // the difference, and stop and start the required container chains.

768

35

    let mut running_chains_before = HashSet::new();

769

35

    let mut running_chains_after = HashSet::new();

770

35

771

35

    running_chains_before.extend(state.assigned_para_id);

772

35

    running_chains_before.extend(state.next_assigned_para_id);

773

35

    // Ignore orchestrator_para_id because it is handled in a special way, as it does not need to

774

35

    // start one session before in order to sync.

775

35

    running_chains_before.remove(&orchestrator_para_id);

776

35

777

35

    running_chains_after.extend(current);

778

35

    running_chains_after.extend(next);

779

35

    running_chains_after.remove(&orchestrator_para_id);

780

35

    let mut need_to_restart_current = false;

781

35

    let mut need_to_restart_next = false;

782

35

783

35

    if state.assigned_para_id != current {

784

24

        if let Some(para_id) = current {

785

            // If the assigned container chain has changed, we may need to

786

            // restart it in collation mode, unless it is the orchestrator chain.

787

16

            if para_id != orchestrator_para_id {

788

13

                need_to_restart_current = true;

789

13

790

8

791

792

24

        if let Some(para_id) = state.assigned_para_id {

793

18

            if para_id != orchestrator_para_id && Some(para_id) == next {

794

2

                need_to_restart_next = true;

795

16

796

6

797

11

798

799

35

    state.assigned_para_id = current;

800

35

    state.next_assigned_para_id = next;

801

35

802

35

    let mut chains_to_stop: Vec<_> = running_chains_before

803

35

        .difference(&running_chains_after)

804

35

        .copied()

805

35

        .collect();

806

35

    let mut chains_to_start: Vec<_> = running_chains_after

807

35

        .difference(&running_chains_before)

808

35

        .copied()

809

35

        .collect();

810

35

811

35

    if need_to_restart_current {

812

        // Force restart of new assigned container chain: if it was running before it was in "syncing mode",

813

        // which doesn't use the correct ports, so start it in "collation mode".

814

13

        let id = current.unwrap();

815

13

        if running_chains_before.contains(&id) && !chains_to_stop.contains(&id) {

816

6

            chains_to_stop.push(id);

817

7

818

13

        if !chains_to_start.contains(&id) {

819

6

            chains_to_start.push(id);

820

7

821

22

822

823

35

    if need_to_restart_next {

824

        // Handle edge case of going from (2000, 2001) to (2001, 2000). In that case we must restart both chains,

825

        // because previously 2000 was collating and now 2000 will only be syncing.

826

2

        let id = next.unwrap();

827

2

        if running_chains_before.contains(&id) && !chains_to_stop.contains(&id) {

828

2

            chains_to_stop.push(id);

829

2

830

2

        if !chains_to_start.contains(&id) {

831

2

            chains_to_start.push(id);

832

2

833

33

834

835

    HandleUpdateAssignmentResult {

836

35

        chains_to_stop,

837

35

        chains_to_start,

838

35

        need_to_restart: need_to_restart_current || need_to_restart_next,

839

840

35

841

842

/// Select [SyncMode] to use for a container chain.

843

/// We want to use warp sync unless the db still exists, or the container chain is

844

/// still at genesis block (because of a warp sync bug in that case).

845

///

846

/// Remember that warp sync doesn't work if a partially synced database already exists, it falls

847

/// back to full sync instead. The only exception is if the previous instance of the database was

848

/// interrupted before it finished downloading the state, in that case the node will use warp sync.

849

/// If it was interrupted during the block history download, the node will use full sync but also

850

/// finish the block history download in the background, even if sync mode is set to full sync.

851

pub fn select_sync_mode_using_client(

852

    db_exists: bool,

853

    orchestrator_client: &Arc<ParachainClient>,

854

    container_chain_para_id: ParaId,

855

) -> sc_service::error::Result<SyncMode> {

856

    if db_exists {

857

        // If the user wants to use warp sync, they should have already removed the database

858

        return Ok(SyncMode::Full);

859

860

861

    // The following check is only needed because of this bug:

862

    // https://github.com/paritytech/polkadot-sdk/issues/1930

863

864

    let orchestrator_runtime_api = orchestrator_client.runtime_api();

865

    let orchestrator_chain_info = orchestrator_client.chain_info();

866

867

    // If the container chain is still at genesis block, use full sync because warp sync is broken

868

    let full_sync_needed = orchestrator_runtime_api

869

        .latest_author(orchestrator_chain_info.best_hash, container_chain_para_id)

870

        .map_err(|e| format!("Failed to read latest author: {}", e))?

871

        .is_none();

872

873

    if full_sync_needed {

874

        Ok(SyncMode::Full)

875

    } else {

876

        Ok(SyncMode::Warp)

877

878

879

880

async fn get_latest_container_block_number_from_orchestrator(

881

    orchestrator_chain_interface: &Arc<dyn OrchestratorChainInterface>,

882

    orchestrator_block_hash: PHash,

883

    container_chain_para_id: ParaId,

884

) -> Option<u32> {

885

    // Get the container chain's latest block from orchestrator chain and compare with client's one

886

887

    orchestrator_chain_interface

888

        .latest_block_number(orchestrator_block_hash, container_chain_para_id)

889

        .await

890

        .unwrap_or_default()

891

892

893

#[derive(Debug)]

894

#[allow(dead_code)]

895

enum DbRemovalReason {

896

    HighBlockDiff {

897

        best_block_number_db: u32,

898

        best_block_number_onchain: u32,

899

},

900

    GenesisHashMismatch {

901

        container_client_genesis_hash: H256,

902

        chain_spec_genesis_hash_v0: H256,

903

        chain_spec_genesis_hash_v1: H256,

904

},

905

906

907

/// Given a container chain client, check if the database is valid. If not, returns `Some` with the

908

/// reason for db removal.

909

/// Reasons may be:

910

/// * High block diff: when the local db is outdated and it would take a long time to sync using full sync, we remove it to be able to use warp sync.

911

/// * Genesis hash mismatch, when the chain was deregistered and a different chain with the same para id was registered.

912

async fn db_needs_removal(

913

    container_chain_client: &Arc<ContainerChainClient>,

914

    orchestrator_chain_interface: &Arc<dyn OrchestratorChainInterface>,

915

    orchestrator_block_hash: PHash,

916

    container_chain_para_id: ParaId,

917

    container_chain_cli: &ContainerChainCli,

918

    keep_db: bool,

919

) -> sc_service::error::Result<Option<DbRemovalReason>> {

920

    // Check block diff, only needed if keep-db is false

921

    if !keep_db {

922

        // Get latest block number from the container chain client

923

        let last_container_block_temp = container_chain_client.chain_info().best_number;

924

        if last_container_block_temp == 0 {

925

            // Don't remove an empty database, as it may be in the process of a warp sync

926

        } else if get_latest_container_block_number_from_orchestrator(

927

            orchestrator_chain_interface,

928

            orchestrator_block_hash,

929

            container_chain_para_id,

930

931

        .await

932

        .unwrap_or(0)

933

        .abs_diff(last_container_block_temp)

934

            > MAX_BLOCK_DIFF_FOR_FULL_SYNC

935

936

            // if the diff is big, delete db and restart using warp sync

937

            return Ok(Some(DbRemovalReason::HighBlockDiff {

938

                best_block_number_db: last_container_block_temp,

939

                best_block_number_onchain: last_container_block_temp,

940

            }));

941

942

943

944

    // Generate genesis hash to compare against container client's genesis hash

945

    let container_preloaded_genesis = container_chain_cli.preloaded_chain_spec.as_ref().unwrap();

946

947

    // Check with both state versions, but first v1 which is the latest

948

    let block_v1: Block =

949

        generate_genesis_block(&**container_preloaded_genesis, sp_runtime::StateVersion::V1)

950

            .map_err(|e| format!("{:?}", e))?;

951

    let chain_spec_genesis_hash_v1 = block_v1.header().hash();

952

953

    let container_client_genesis_hash = container_chain_client.chain_info().genesis_hash;

954

955

    if container_client_genesis_hash != chain_spec_genesis_hash_v1 {

956

        let block_v0: Block =

957

            generate_genesis_block(&**container_preloaded_genesis, sp_runtime::StateVersion::V0)

958

                .map_err(|e| format!("{:?}", e))?;

959

        let chain_spec_genesis_hash_v0 = block_v0.header().hash();

960

961

        if container_client_genesis_hash != chain_spec_genesis_hash_v0 {

962

            log::info!("Container genesis V0: {:?}", chain_spec_genesis_hash_v0);

963

            log::info!("Container genesis V1: {:?}", chain_spec_genesis_hash_v1);

964

            log::info!(

965

                "Chain spec genesis {:?} did not match with any container genesis - Restarting...",

966

                container_client_genesis_hash

967

);

968

            return Ok(Some(DbRemovalReason::GenesisHashMismatch {

969

                container_client_genesis_hash,

970

                chain_spec_genesis_hash_v0,

971

                chain_spec_genesis_hash_v1,

972

            }));

973

974

975

976

    Ok(None)

977

978

979

/// Remove the container chain database folder. This is called with db_path:

980

///     `Collator2002-01/data/containers/chains/simple_container_2002/paritydb/full-container-2002`

981

/// but we want to delete everything under

982

///     `Collator2002-01/data/containers/chains/simple_container_2002`

983

/// So we use `delete_empty_folders_recursive` to try to remove the parent folders as well, but only

984

/// if they are empty. This is to avoid removing any secret keys or other important data.

985

fn delete_container_chain_db(db_path: &Path) {

986

    // Remove folder `full-container-2002`

987

    let _ = std::fs::remove_dir_all(db_path);

988

    // Remove all the empty folders inside `simple_container_2002`, including self

989

    if let Some(parent) = db_path.ancestors().nth(2) {

990

        delete_empty_folders_recursive(parent);

991

992

993

994

/// Removes all empty folders in `path`, recursively. Then, if `path` is empty, it removes it as well.
/// Ignores any IO errors.
///
/// Symbolic links are never followed: a link to a directory is treated as a regular entry, so
/// nothing outside of `path` can be removed, and the folder containing the link is kept.
fn delete_empty_folders_recursive(path: &Path) {
    let entries = match std::fs::read_dir(path) {
        Ok(entries) => entries,
        Err(_e) => return,
    };

    for entry in entries {
        let entry = match entry {
            Ok(entry) => entry,
            Err(_e) => continue,
        };

        // `DirEntry::file_type` does not traverse symlinks, unlike `Path::is_dir`, so a symlink
        // pointing to a directory is not descended into.
        let is_real_dir = entry
            .file_type()
            .map(|file_type| file_type.is_dir())
            .unwrap_or(false);
        if is_real_dir {
            delete_empty_folders_recursive(&entry.path());
        }
    }

    // Try to remove dir. Returns an error if the directory is not empty, but we ignore it.
    let _ = std::fs::remove_dir(path);
}

1018

1019

/// Parse a list of boot nodes in `Vec<u8>` format. Invalid boot nodes are filtered out.

1020

3

fn parse_boot_nodes_ignore_invalid(

1021

3

    boot_nodes_raw: Vec<Vec<u8>>,

1022

3

    container_chain_para_id: ParaId,

1023

3

) -> Vec<MultiaddrWithPeerId> {

1024

3

    boot_nodes_raw

1025

3

        .into_iter()

1026

3

        .filter_map(|x| {

1027

3

            let x = String::from_utf8(x)

1028

3

                .map_err(|e| {

1029

1

                    log::debug!(

1030

                        "Invalid boot node in container chain {}: {}",

1031

                        container_chain_para_id,

1032

1033

);

1034

3

})

1035

3

                .ok()?;

1036

1037

2

            x.parse::<MultiaddrWithPeerId>()

1038

2

                .map_err(|e| {

1039

1

                    log::debug!(

1040

                        "Invalid boot node in container chain {}: {}",

1041

                        container_chain_para_id,

1042

1043

1044

2

})

1045

2

                .ok()

1046

3

})

1047

3

        .collect()

1048

3

1049

1050

pub async fn wait_for_paritydb_lock(db_path: &Path, max_timeout: Duration) -> Result<(), String> {

1051

    let now = Instant::now();

1052

1053

    while now.elapsed() < max_timeout {

1054

        let lock_held = check_paritydb_lock_held(db_path)

1055

            .map_err(|e| format!("Failed to check if lock file is held: {}", e))?;

1056

        if !lock_held {

1057

            return Ok(());

1058

1059

        sleep(Duration::from_secs(1)).await;

1060

1061

1062

    Err("Timeout when waiting for paritydb lock".to_string())

1063

1064

1065

/// Given a path to a paritydb database, check if its lock file is held. This indicates that a

1066

/// background process is still using the database, so we should wait before trying to open it.

1067

///

1068

/// This should be kept up to date with the way paritydb handles the lock file:

1069

/// <https://github.com/paritytech/parity-db/blob/2b6820e310a08678d4540c044f41a93d87343ac8/src/db.rs#L215>

1070

fn check_paritydb_lock_held(db_path: &Path) -> Result<bool, std::io::Error> {

1071

    if !db_path.is_dir() {

1072

        // Lock file does not exist, so it is not held

1073

        return Ok(false);

1074

1075

1076

    let mut lock_path: std::path::PathBuf = db_path.to_owned();

1077

    lock_path.push("lock");

1078

    let lock_file = std::fs::OpenOptions::new()

1079

        .create(true)

1080

        .read(true)

1081

        .write(true)

1082

        .truncate(true)

1083

        .open(lock_path.as_path())?;

1084

    // Check if the lock file is busy by trying to lock it.

1085

    // Returns err if failed to adquire the lock.

1086

    let lock_held = lock_file.try_lock_exclusive().is_err();

1087

1088

    Ok(lock_held)

1089

1090

1091

#[cfg(test)]

1092

mod tests {

1093

    use {super::*, std::path::PathBuf};

1094

1095

    // Copy of ContainerChainSpawner with extra assertions for tests, and mocked spawn function.

1096

    struct MockContainerChainSpawner {

1097

        state: Arc<Mutex<ContainerChainSpawnerState>>,

1098

        orchestrator_para_id: ParaId,

1099

        collate_on_tanssi: Arc<

1100

            dyn Fn() -> (CancellationToken, futures::channel::oneshot::Receiver<()>) + Send + Sync,

1101

>,

1102

        collation_cancellation_constructs: Option<()>,

1103

        // Keep track of the last CollateOn message, for tests

1104

        currently_collating_on: Arc<Mutex<Option<ParaId>>>,

1105

1106

1107

    impl MockContainerChainSpawner {

1108

10

        fn new() -> Self {

1109

10

            let orchestrator_para_id = 1000.into();

1110

10

            // The node always starts as an orchestrator chain collator

1111

10

            let currently_collating_on = Arc::new(Mutex::new(Some(orchestrator_para_id)));

1112

10

            let currently_collating_on2 = currently_collating_on.clone();

1113

10

            let collate_closure = move || {

1114

3

                let mut cco = currently_collating_on2.lock().unwrap();

1115

3

                assert_ne!(

1116

3

                    *cco,

1117

3

                    Some(orchestrator_para_id),

1118

                    "Received CollateOn message when we were already collating on this chain: {}",

1119

                    orchestrator_para_id

1120

);

1121

3

                *cco = Some(orchestrator_para_id);

1122

3

                let (_, receiver) = futures::channel::oneshot::channel();

1123

3

                (CancellationToken::new(), receiver)

1124

3

};

1125

10

            let collate_on_tanssi: Arc<

1126

10

                dyn Fn() -> (CancellationToken, futures::channel::oneshot::Receiver<()>)

1127

10

                    + Send

1128

10

                    + Sync,

1129

10

            > = Arc::new(collate_closure);

1130

10

1131

10

            Self {

1132

10

                state: Arc::new(Mutex::new(ContainerChainSpawnerState {

1133

10

                    spawned_container_chains: Default::default(),

1134

10

                    assigned_para_id: Some(orchestrator_para_id),

1135

10

                    next_assigned_para_id: None,

1136

10

                    failed_para_ids: Default::default(),

1137

10

                    spawned_containers_monitor: Default::default(),

1138

10

                })),

1139

10

                orchestrator_para_id,

1140

10

                collate_on_tanssi,

1141

10

                // Some if collator starts on orchestrator chain

1142

10

                collation_cancellation_constructs: Some(()),

1143

10

                currently_collating_on,

1144

10

1145

10

1146

1147

21

        fn spawn(&self, container_chain_para_id: ParaId, start_collation: bool) {

1148

21

            let (signal, _on_exit) = oneshot::channel();

1149

21

            let currently_collating_on2 = self.currently_collating_on.clone();

1150

21

            let collate_closure = move || {

1151

13

                let mut cco = currently_collating_on2.lock().unwrap();

1152

13

                assert_ne!(

1153

13

                    *cco,

1154

13

                    Some(container_chain_para_id),

1155

                    "Received CollateOn message when we were already collating on this chain: {}",

1156

                    container_chain_para_id

1157

);

1158

13

                *cco = Some(container_chain_para_id);

1159

13

                let (_, receiver) = futures::channel::oneshot::channel();

1160

13

                (CancellationToken::new(), receiver)

1161

13

};

1162

21

            let collate_on: Arc<

1163

21

                dyn Fn() -> (CancellationToken, futures::channel::oneshot::Receiver<()>)

1164

21

                    + Send

1165

21

                    + Sync,

1166

21

            > = Arc::new(collate_closure);

1167

21

            // Dummy db_path for tests, is not actually used

1168

21

            let db_path = PathBuf::from(format!("/tmp/container-{}/db", container_chain_para_id));

1169

21

1170

21

            let old = self

1171

21

                .state

1172

21

                .lock()

1173

21

                .expect("poison error")

1174

21

                .spawned_container_chains

1175

21

                .insert(

1176

21

                    container_chain_para_id,

1177

21

                    ContainerChainState {

1178

21

                        stop_handle: StopContainerChain { signal, id: 0 },

1179

21

                        db_path,

1180

21

},

1181

21

);

1182

21

1183

21

            assert!(

1184

21

                old.is_none(),

1185

                "tried to spawn a container chain that was already running: {}",

1186

                container_chain_para_id

1187

);

1188

1189

21

            if start_collation {

1190

13

                let (_cancellation_token, _exit_receiver) = collate_on();

1191

13

1192

21

1193

1194

15

        fn stop(&self, container_chain_para_id: ParaId) {

1195

15

            let stop_handle = self

1196

15

                .state

1197

15

                .lock()

1198

15

                .expect("poison error")

1199

15

                .spawned_container_chains

1200

15

                .remove(&container_chain_para_id);

1201

15

1202

15

            match stop_handle {

1203

15

                Some(_stop_handle) => {

1204

15

                    log::info!("Stopping container chain {}", container_chain_para_id);

1205

1206

                None => {

1207

                    panic!(

1208

                        "Tried to stop a container chain that is not running: {}",

1209

                        container_chain_para_id

1210

);

1211

1212

1213

1214

            // Update currently_collating_on, if we stopped the chain we are no longer collating there

1215

15

            let mut lco = self.currently_collating_on.lock().unwrap();

1216

15

            if *lco == Some(container_chain_para_id) {

1217

7

                *lco = None;

1218

8

1219

15

1220

1221

35

        fn handle_update_assignment(&mut self, current: Option<ParaId>, next: Option<ParaId>) {

1222

35

            let HandleUpdateAssignmentResult {

1223

35

                chains_to_stop,

1224

35

                chains_to_start,

1225

35

                need_to_restart,

1226

35

            } = handle_update_assignment_state_change(

1227

35

                &mut self.state.lock().unwrap(),

1228

35

                self.orchestrator_para_id,

1229

35

                current,

1230

35

                next,

1231

35

);

1232

35

1233

35

            if current != Some(self.orchestrator_para_id) {

1234

                // If not assigned to orchestrator chain anymore, we need to stop the collator process

1235

27

                let mut cco = self.currently_collating_on.lock().unwrap();

1236

27

                if *cco == Some(self.orchestrator_para_id) {

1237

10

                    *cco = None;

1238

17

1239

27

                self.collation_cancellation_constructs = None;

1240

8

            } else if self.collation_cancellation_constructs.is_none() {

1241

3

                let (_cancellation_token, _exit_notification_receiver) = (self.collate_on_tanssi)();

1242

3

                self.collation_cancellation_constructs = Some(());

1243

5

1244

1245

            // Assert we never start and stop the same container chain

1246

56

            for para_id in &chains_to_start {

1247

21

                if !need_to_restart {

1248

4

                    assert!(

1249

4

                        !chains_to_stop.contains(para_id),

1250

                        "Tried to start and stop same container chain: {}",

1251

                        para_id

1252

);

1253

                } else {

1254

                    // Will try to start and stop container chain with id "current" or "next", so ignore that

1255

17

                    if Some(*para_id) != current && Some(*para_id) != next {

1256

                        assert!(

1257

                            !chains_to_stop.contains(para_id),

1258

                            "Tried to start and stop same container chain: {}",

1259

                            para_id

1260

);

1261

17

1262

1263

1264

            // Assert we never start or stop the orchestrator chain

1265

35

            assert!(!chains_to_start.contains(&self.orchestrator_para_id));

1266

35

            assert!(!chains_to_stop.contains(&self.orchestrator_para_id));

1267

1268

            // Stop all container chains that are no longer needed

1269

50

            for para_id in chains_to_stop {

1270

15

                self.stop(para_id);

1271

15

1272

1273

            // Start all new container chains (usually 1)

1274

56

            for para_id in chains_to_start {

1275

21

                // Edge case: when starting the node it may be assigned to a container chain, so we need to

1276

21

                // start a container chain already collating.

1277

21

                let start_collation = Some(para_id) == current;

1278

21

                self.spawn(para_id, start_collation);

1279

21

1280

1281

            // Assert that if we are currently assigned to a container chain, we are collating there

1282

35

            if let Some(para_id) = current {

1283

24

                self.assert_collating_on(Some(para_id));

1284

24

            } else {

1285

11

                self.assert_collating_on(None);

1286

11

1287

35

1288

1289

        #[track_caller]

1290

71

        fn assert_collating_on(&self, para_id: Option<ParaId>) {

1291

71

            let currently_collating_on = *self.currently_collating_on.lock().unwrap();

1292

71

            assert_eq!(currently_collating_on, para_id);

1293

71

1294

1295

        #[track_caller]

1296

36

        fn assert_running_chains(&self, para_ids: &[ParaId]) {

1297

36

            let mut actually_running: Vec<ParaId> = self

1298

36

                .state

1299

36

                .lock()

1300

36

                .unwrap()

1301

36

                .spawned_container_chains

1302

36

                .keys()

1303

36

                .cloned()

1304

36

                .collect();

1305

36

            actually_running.sort();

1306

36

            let mut should_be_running = para_ids.to_vec();

1307

36

            should_be_running.sort();

1308

36

            assert_eq!(actually_running, should_be_running);

1309

36

1310

1311

1312

    // A new node collates on the orchestrator chain until it receives an empty assignment
    #[test]
    fn starts_collating_on_tanssi() {
        let orchestrator_para_id: ParaId = 1000.into();
        let mut spawner = MockContainerChainSpawner::new();

        spawner.assert_collating_on(Some(orchestrator_para_id));
        spawner.assert_running_chains(&[]);

        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);
    }

1322

1323

    /// Walks through assignments that only ever mention para 1000 and checks that
    /// no container chain is spawned at any step.
    #[test]
    fn assigned_to_orchestrator_chain() {
        let orchestrator: ParaId = 1000.into();
        let mut spawner = MockContainerChainSpawner::new();

        // Assigned now and in the next session.
        spawner.handle_update_assignment(Some(orchestrator), Some(orchestrator));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);

        // Assigned now, nothing next.
        spawner.handle_update_assignment(Some(orchestrator), None);
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);

        // No assignment at all.
        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        // Only a next assignment: not collating yet.
        spawner.handle_update_assignment(None, Some(orchestrator));
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        // Back to being assigned now and next.
        spawner.handle_update_assignment(Some(orchestrator), Some(orchestrator));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);
    }

1347

1348

    /// Same sequence as the orchestrator test, but with para 2000: here the chain
    /// is expected to be running whenever it is the current or the next assignment.
    #[test]
    fn assigned_to_container_chain() {
        let container: ParaId = 2000.into();
        let mut spawner = MockContainerChainSpawner::new();

        // Assigned now and in the next session.
        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        // Assigned now, nothing next.
        spawner.handle_update_assignment(Some(container), None);
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        // No assignment at all: the chain is stopped.
        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        // Only a next assignment: the chain runs, but without collating.
        spawner.handle_update_assignment(None, Some(container));
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[container]);

        // Back to being assigned now and next.
        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);
    }

1372

1373

    /// Rotates the assignment from para 1000 through paras 2000 and 2001 and back,
    /// checking the collation target and the running chains after each update.
    #[test]
    fn spawn_container_chains() {
        let orchestrator: ParaId = 1000.into();
        let container_a: ParaId = 2000.into();
        let container_b: ParaId = 2001.into();
        let mut spawner = MockContainerChainSpawner::new();

        // The next assignment is already running while we still collate on 1000.
        spawner.handle_update_assignment(Some(orchestrator), Some(container_a));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[container_a]);

        spawner.handle_update_assignment(Some(container_a), Some(container_a));
        spawner.assert_collating_on(Some(container_a));
        spawner.assert_running_chains(&[container_a]);

        // Current and next differ: both chains are running.
        spawner.handle_update_assignment(Some(container_a), Some(container_b));
        spawner.assert_collating_on(Some(container_a));
        spawner.assert_running_chains(&[container_a, container_b]);

        spawner.handle_update_assignment(Some(container_b), Some(container_b));
        spawner.assert_collating_on(Some(container_b));
        spawner.assert_running_chains(&[container_b]);

        // Para 1000 as the next assignment is never listed as a running chain.
        spawner.handle_update_assignment(Some(container_b), Some(orchestrator));
        spawner.assert_collating_on(Some(container_b));
        spawner.assert_running_chains(&[container_b]);

        spawner.handle_update_assignment(Some(orchestrator), Some(orchestrator));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);
    }

1401

1402

    /// Going from (2000, 2001) to (2001, 2000) shouldn't start or stop any container chains.
    #[test]
    fn swap_current_next() {
        let first: ParaId = 2000.into();
        let second: ParaId = 2001.into();
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(first), Some(second));
        spawner.assert_collating_on(Some(first));
        spawner.assert_running_chains(&[first, second]);

        // Swapped roles: same running set, only the collation target moves.
        spawner.handle_update_assignment(Some(second), Some(first));
        spawner.assert_collating_on(Some(second));
        spawner.assert_running_chains(&[first, second]);
    }

1415

1416

    /// Collation on para 1000 can be dropped and picked up again without any
    /// container chain ever being spawned.
    #[test]
    fn stop_collating_orchestrator() {
        let orchestrator: ParaId = 1000.into();
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(orchestrator), Some(orchestrator));
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);

        spawner.handle_update_assignment(Some(orchestrator), None);
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);

        // Assignment removed: collation stops.
        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        // Assignment restored: collation resumes.
        spawner.handle_update_assignment(Some(orchestrator), None);
        spawner.assert_collating_on(Some(orchestrator));
        spawner.assert_running_chains(&[]);
    }

1436

1437

    /// Para 2000 is stopped, restarted as a next assignment only, and then
    /// becomes the current assignment again.
    #[test]
    fn stop_collating_container() {
        let container: ParaId = 2000.into();
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(container), None);
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        // Assignment removed: the chain is stopped.
        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        // Restarted as the next assignment: running, not collating.
        spawner.handle_update_assignment(None, Some(container));
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[container]);

        // This will send a CollateOn message to the same chain as the last CollateOn,
        // but this is needed because that chain has been stopped
        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);
    }

1459

1460

    /// Para 2000 is stopped and then becomes the current assignment again in a
    /// single update, without first appearing as a next-only assignment.
    #[test]
    fn stop_collating_container_start_immediately() {
        let container: ParaId = 2000.into();
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(container), None);
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        // Assignment removed: the chain is stopped.
        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);

        // This will start the chain already collating
        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);
    }

1477

1478

    /// With two container chains running, an update with no assignments stops both.
    #[test]
    fn stop_all_chains() {
        let first: ParaId = 2000.into();
        let second: ParaId = 2001.into();
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(first), Some(second));
        spawner.assert_collating_on(Some(first));
        spawner.assert_running_chains(&[first, second]);

        // Neither a current nor a next assignment.
        spawner.handle_update_assignment(None, None);
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[]);
    }

1490

1491

    /// Para 2000 stays in the running set while it moves from current-only to
    /// next-only and back to current.
    #[test]
    fn keep_collating_on_container() {
        let container: ParaId = 2000.into();
        let mut spawner = MockContainerChainSpawner::new();

        spawner.handle_update_assignment(Some(container), None);
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);

        // Only the next assignment: still running, but collation stops.
        spawner.handle_update_assignment(None, Some(container));
        spawner.assert_collating_on(None);
        spawner.assert_running_chains(&[container]);

        spawner.handle_update_assignment(Some(container), Some(container));
        spawner.assert_collating_on(Some(container));
        spawner.assert_running_chains(&[container]);
    }

1507

1508

    /// `parse_boot_nodes_ignore_invalid` returns an empty list for the two
    /// malformed inputs below and one entry for the well-formed boot node.
    #[test]
    fn invalid_boot_nodes_are_ignored() {
        let para_id = 100.into();
        let valid_bootnode =
            b"/ip4/127.0.0.1/tcp/33049/ws/p2p/12D3KooWHVMhQDHBpj9vQmssgyfspYecgV6e3hH1dQVDUkUbCYC9"
                .to_vec();

        // A single ASCII letter.
        let only_a_letter = vec![b"A".to_vec()];
        assert_eq!(
            parse_boot_nodes_ignore_invalid(only_a_letter, para_id),
            vec![]
        );

        // The single byte 0xff, which is not valid UTF-8.
        let invalid_utf8 = vec![b"\xff".to_vec()];
        assert_eq!(
            parse_boot_nodes_ignore_invalid(invalid_utf8, para_id),
            vec![]
        );

        // Valid boot nodes are not ignored
        let parsed = parse_boot_nodes_ignore_invalid(vec![valid_bootnode], para_id);
        assert_eq!(parsed.len(), 1);
    }

1528

1529

    /// Test the implementation of `delete_container_chain_db`: the third item
    /// yielded by `ancestors()` (index 2) is the directory two levels above the
    /// database path.
    #[test]
    fn path_ancestors() {
        let expected_chain_dir = PathBuf::from(
            "/tmp/zombienet/Collator2002-01/data/containers/chains/simple_container_2002",
        );
        let full_db_path = PathBuf::from("/tmp/zombienet/Collator2002-01/data/containers/chains/simple_container_2002/paritydb/full-container-2002");

        // `ancestors()` yields the path itself first, then its parent, then the grandparent.
        let grandparent = full_db_path.ancestors().nth(2).unwrap();

        assert_eq!(grandparent, expected_chain_dir);
    }

1542