objectstore_service/backend/changelog.rs
1//! Change lifecycle tracking and durable write-ahead log.
2//!
3//! When a storage mutation spans both the high-volume (HV) and long-term (LT)
4//! backends, several non-atomic steps must happen in sequence: uploading to LT,
5//! committing a tombstone in HV via compare-and-swap, and cleaning up
6//! unreferenced blobs. A crash at any point can leave orphaned LT blobs.
7//!
8//! This module provides two layers of protection:
9//!
10//! 1. **In-process tracking** — [`ChangeGuard`] is an RAII guard that tracks
11//! the current [`ChangePhase`] of an operation. When dropped, it spawns a
12//! background task to clean up whichever blob is unreferenced based on the
13//! phase reached before the drop. This handles normal errors and early
14//! returns within a running process.
15//!
16//! 2. **Durable write-ahead log** — The [`ChangeLog`] trait records a
17//! [`Change`] to durable storage *before* any LT side effects begin. If the
18//! process crashes, a recovery scan reads outstanding entries and cleans up
19//! orphaned blobs. Recovery is garbage collection — it never replays CAS
20//! mutations or finishes incomplete operations.
21
22use std::collections::HashMap;
23use std::fmt;
24use std::sync::{Arc, Mutex};
25use std::time::Duration;
26
27use tokio_util::task::TaskTracker;
28use tokio_util::task::task_tracker::TaskTrackerToken;
29
30use crate::backend::common::{Backend, HighVolumeBackend, TieredMetadata};
31use crate::error::Result;
32use crate::id::ObjectId;
33
/// Initial delay for exponential backoff retries in background cleanup tasks.
const INITIAL_BACKOFF: Duration = Duration::from_millis(100);
/// Maximum delay for exponential backoff retries in background cleanup tasks.
///
/// Each retry multiplies the previous delay by 1.5 and caps it at this value;
/// see the retry loops in `ChangeState`.
const MAX_BACKOFF: Duration = Duration::from_secs(30);
38
/// Unique identifier for a change log entry.
///
/// Generated per-operation as a UUIDv7. In durable storage, scoped to the
/// owning service instance (e.g., `~oplog/{instance_id}/{change_id}`).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ChangeId(uuid::Uuid);

impl ChangeId {
    /// Generates a new unique change ID.
    ///
    /// UUIDv7 is time-ordered, so IDs sort roughly by creation time.
    // NOTE(review): the missing `Default` impl is presumably deliberate — a
    // "default" that yields a fresh random ID each call would be misleading.
    #[allow(clippy::new_without_default)]
    pub fn new() -> Self {
        Self(uuid::Uuid::now_v7())
    }
}
53
54impl fmt::Display for ChangeId {
55 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
56 self.0.fmt(f)
57 }
58}
59
/// Describes the LT blobs involved in a multi-step storage change.
///
/// Every mutating flow maps to: "I may have written a `new` LT blob and I
/// may be replacing an `old` LT blob." Recovery uses these fields to determine
/// which blobs are orphaned by reading the current HV state.
#[derive(Debug, Clone)]
pub struct Change {
    /// The logical object being mutated.
    ///
    /// Used by cleanup to query HV and determine which blob is currently referenced.
    pub id: ObjectId,
    /// The new LT blob written by this operation.
    ///
    /// Needs cleanup on failure (the CAS did not commit).
    pub new: Option<ObjectId>,
    /// The old LT blob being replaced.
    ///
    /// Needs cleanup on success (the CAS committed and the old blob is unreferenced).
    ///
    /// `None` for pure creations, where there is nothing to replace.
    pub old: Option<ObjectId>,
}
80
/// Manager for multi-step storage changes, including backends and durable log.
///
/// Encapsulates the state and logic for recording changes, advancing their phases,
/// and performing cleanup on drop. The `TieredStorage` backend holds an instance
/// of this manager to use it for its multi-step operations.
#[derive(Debug)]
pub struct ChangeManager {
    /// The backend for small objects (≤ 1 MiB).
    pub(crate) high_volume: Box<dyn HighVolumeBackend>,
    /// The backend for large objects (> 1 MiB).
    pub(crate) long_term: Box<dyn Backend>,
    /// Durable write-ahead log for multi-step changes.
    pub(crate) changelog: Box<dyn ChangeLog>,
    /// Tracks outstanding background cleanup operations for graceful shutdown.
    ///
    /// Every live [`ChangeGuard`] / recovery pass holds a token from this
    /// tracker, so shutdown can wait for in-flight cleanup to finish.
    pub(crate) tracker: TaskTracker,
}
97
98impl ChangeManager {
99 /// Creates a new `ChangeManager` with the given backends and changelog.
100 pub fn new(
101 high_volume: Box<dyn HighVolumeBackend>,
102 long_term: Box<dyn Backend>,
103 changelog: Box<dyn ChangeLog>,
104 ) -> Arc<Self> {
105 Arc::new(Self {
106 high_volume,
107 long_term,
108 changelog,
109 tracker: TaskTracker::new(),
110 })
111 }
112
113 /// Records the change to the log and returns a guard.
114 ///
115 /// Generates a unique [`ChangeId`] and writes a durable log entry before
116 /// returning. The caller may proceed with LT side effects immediately after.
117 ///
118 /// When the [`ChangeGuard`] is dropped, a background process is spawned to
119 /// clean up any unreferenced objects in LT storage.
120 pub async fn record(self: Arc<Self>, change: Change) -> Result<ChangeGuard> {
121 let token = self.tracker.token();
122
123 let id = ChangeId::new();
124 self.changelog.record(&id, &change).await?;
125
126 let state = ChangeState {
127 id,
128 change,
129 phase: ChangePhase::Recorded,
130 manager: self.clone(),
131 _token: token,
132 };
133
134 Ok(ChangeGuard { state: Some(state) })
135 }
136
137 /// Scans the changelog for outstanding entries and runs cleanup for each.
138 ///
139 /// Spawn this into a background task at startup to recover from any orphaned objects after a
140 /// crash. During normal operation, this should return an empty list and have no effect.
141 pub async fn recover(self: Arc<Self>) -> Result<()> {
142 // Hold one token for the duration of recovery to prevent premature shutdown.
143 let _token = self.tracker.token();
144
145 let entries =
146 self.changelog.scan().await.inspect_err(|e| {
147 objectstore_log::error!(!!e, "Failed to run changelog recovery")
148 })?;
149
150 // NB: Intentionally clean up sequentially to reduce load on the system.
151 for (id, change) in entries {
152 let state = ChangeState {
153 id,
154 change,
155 phase: ChangePhase::Recovered,
156 manager: self.clone(),
157 _token: self.tracker.token(),
158 };
159
160 state.cleanup().await;
161 }
162
163 Ok(())
164 }
165}
166
/// Durable write-ahead log for multi-step storage changes.
///
/// Records in-progress changes that span both HV and LT backends so that
/// recovery can identify and clean up orphaned LT blobs after crashes.
/// The log is stored independently from the data backend (though it may
/// share infrastructure) and is scoped per service instance.
///
/// Recovery is garbage collection — it reads HV state to determine which
/// blobs are unreferenced and deletes them. It never replays CAS mutations
/// or finishes incomplete operations.
///
/// Implementations handle instance identity, heartbeats, and key prefixing
/// internally — callers interact only with entries.
#[async_trait::async_trait]
pub trait ChangeLog: fmt::Debug + Send + Sync {
    /// Records a change before any side effects begin (write-ahead).
    ///
    /// Must be durable before returning — the caller will proceed with
    /// LT writes immediately after.
    async fn record(&self, id: &ChangeId, change: &Change) -> Result<()>;

    /// Removes a completed change from the log.
    ///
    /// Called after all cleanup (LT blob deletion) is finished. Removing
    /// a nonexistent entry is not an error (idempotent).
    async fn remove(&self, id: &ChangeId) -> Result<()>;

    /// Returns all outstanding changes eligible for recovery.
    ///
    /// During normal operation this returns only the calling instance's
    /// entries. During recovery of a dead instance, the implementation
    /// may return that instance's entries after the caller has claimed
    /// ownership (via heartbeat CAS).
    ///
    /// The returned entries are unordered.
    async fn scan(&self) -> Result<Vec<(ChangeId, Change)>>;
}
204
/// In-memory [`ChangeLog`] for tests and deployments without durable logging.
///
/// Stores entries in a `HashMap`. [`Clone`]-able so tests can hold a handle
/// for direct inspection while the service owns a boxed copy.
///
/// Not durable: entries are lost when the process exits.
#[derive(Debug, Clone, Default)]
pub struct InMemoryChangeLog {
    /// Shared entry map; `Arc` lets clones observe the same state.
    entries: Arc<Mutex<HashMap<ChangeId, Change>>>,
}
213
214#[async_trait::async_trait]
215impl ChangeLog for InMemoryChangeLog {
216 async fn record(&self, id: &ChangeId, change: &Change) -> Result<()> {
217 let mut entries = self.entries.lock().expect("lock poisoned");
218 entries.insert(id.clone(), change.clone());
219 Ok(())
220 }
221
222 async fn remove(&self, id: &ChangeId) -> Result<()> {
223 let mut entries = self.entries.lock().expect("lock poisoned");
224 entries.remove(id);
225 Ok(())
226 }
227
228 async fn scan(&self) -> Result<Vec<(ChangeId, Change)>> {
229 let entries = self.entries.lock().expect("lock poisoned");
230 let result = entries
231 .iter()
232 .map(|(id, change)| (id.clone(), change.clone()))
233 .collect();
234 Ok(result)
235 }
236}
237
/// [`ChangeLog`] implementation that discards all entries.
///
/// Used as the default when no durable log is configured. Provides no
/// crash-recovery guarantees — orphan cleanup relies entirely on in-process
/// [`ChangeGuard`] drop logic.
#[derive(Debug, Default)]
pub struct NoopChangeLog;
245
246#[async_trait::async_trait]
247impl ChangeLog for NoopChangeLog {
248 async fn record(&self, _id: &ChangeId, _change: &Change) -> Result<()> {
249 Ok(())
250 }
251
252 async fn remove(&self, _id: &ChangeId) -> Result<()> {
253 Ok(())
254 }
255
256 async fn scan(&self) -> Result<Vec<(ChangeId, Change)>> {
257 Ok(Vec::new())
258 }
259}
260
/// Phase of a multi-step storage change.
///
/// Phases describe how far an operation progressed, which in turn tells
/// cleanup which LT blob is unreferenced. Fieldless and cheap, so the enum
/// derives `Copy` alongside full equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChangePhase {
    /// The change was recovered from changelog and the phase is unknown.
    Recovered,
    /// The change is recorded in the log and LT upload has started.
    Recorded,
    /// LT upload has succeeded and the tombstone is being updated.
    Written,
    /// The tombstone update failed due to a conflict.
    Lost,
    /// The tombstone update succeeded.
    Updated,
    /// Cleanup complete.
    Completed,
}

impl ChangePhase {
    /// Returns the phase corresponding to the outcome of a compare-and-write operation.
    ///
    /// `true` (the CAS committed) maps to [`Self::Updated`]; `false` (a
    /// conflicting writer won) maps to [`Self::Lost`].
    pub fn compare_and_write(succeeded: bool) -> Self {
        if succeeded { Self::Updated } else { Self::Lost }
    }
}
284
/// Internal state for a [`ChangeGuard`].
///
/// Logs an error if dropped in any phase other than `Completed`.
#[derive(Debug)]
struct ChangeState {
    /// Identifier of the durable changelog entry backing this operation.
    id: ChangeId,
    /// The blobs involved in the operation; drives the cleanup decision.
    change: Change,
    /// Last phase reached; determines which blob is unreferenced on cleanup.
    phase: ChangePhase,
    /// Shared handle to the backends, changelog, and task tracker.
    manager: Arc<ChangeManager>,
    /// Keeps the manager's `TaskTracker` open while this state is alive, so
    /// graceful shutdown waits for in-flight changes and their cleanup.
    _token: TaskTrackerToken,
}
296
impl ChangeState {
    /// Marks the operation as completed, preventing any cleanup on drop.
    ///
    /// Consumes `self`; setting the phase before the implicit drop means the
    /// `Drop` impl observes `Completed` and stays silent.
    fn mark_completed(mut self) {
        self.phase = ChangePhase::Completed;
    }

    /// Determines tombstone state and runs cleanup for unreferenced objects.
    ///
    /// Resolves which LT blob the HV side currently references — directly
    /// from the recorded phase when the outcome is known, or by reading HV
    /// when it is ambiguous — then deletes whichever of `old`/`new` is not
    /// that referenced blob, removes the durable log entry, and marks the
    /// state completed so its drop does not log an error.
    async fn cleanup(self) {
        let current = match self.phase {
            // For `Recovered`, we must first check the state of the tombstone.
            ChangePhase::Recovered => self.read_tombstone().await,
            // `Recorded`: the CAS never ran, so the old blob is still referenced.
            ChangePhase::Recorded => self.change.old.clone(),
            // For `Written`, the CAS outcome is unknown — read HV to determine it.
            ChangePhase::Written => self.read_tombstone().await,
            // `Lost`: the CAS failed, so the old blob remains referenced.
            ChangePhase::Lost => self.change.old.clone(),
            // `Updated`: the CAS committed, so the new blob is now referenced.
            ChangePhase::Updated => self.change.new.clone(),
            ChangePhase::Completed => return, // unreachable
        };

        // Delete whichever side is not the currently referenced blob. If
        // `current` matches neither side, both are unreferenced and deleted.
        if current != self.change.old
            && let Some(ref old) = self.change.old
        {
            self.cleanup_lt(old).await;
        }

        if current != self.change.new
            && let Some(ref new) = self.change.new
        {
            self.cleanup_lt(new).await;
        }

        // Only once LT cleanup has succeeded is it safe to drop the durable
        // log entry — otherwise a crash here would leave untracked orphans.
        self.cleanup_log().await;
        self.mark_completed();
    }

    /// Reads the tombstone target for `id` from HV, retrying with exponential backoff on error.
    ///
    /// Returns `None` if the entry holds an inline object or is absent.
    ///
    /// NOTE(review): retries indefinitely; this relies on HV eventually
    /// becoming reachable, and the held tracker token keeps shutdown waiting
    /// in the meantime — confirm this is the intended shutdown behavior.
    async fn read_tombstone(&self) -> Option<ObjectId> {
        let mut delay = INITIAL_BACKOFF;
        loop {
            match self
                .manager
                .high_volume
                .get_tiered_metadata(&self.change.id)
                .await
            {
                // A tombstone points at the LT blob currently referenced.
                Ok(TieredMetadata::Tombstone(t)) => return Some(t.target),
                // Inline object or missing entry: no LT blob is referenced.
                Ok(TieredMetadata::Object(_)) => return None,
                Ok(TieredMetadata::NotFound) => return None,
                Err(_) => {
                    // Transient failure: back off (×1.5, capped) and retry.
                    tokio::time::sleep(delay).await;
                    delay = (delay.mul_f32(1.5)).min(MAX_BACKOFF);
                }
            }
        }
    }

    /// Deletes `target` from `lt`, retrying with exponential backoff until success.
    async fn cleanup_lt(&self, target: &ObjectId) {
        let mut delay = INITIAL_BACKOFF;
        while self.manager.long_term.delete_object(target).await.is_err() {
            tokio::time::sleep(delay).await;
            delay = (delay.mul_f32(1.5)).min(MAX_BACKOFF);
        }
    }

    /// Removes this change's log entry, retrying with exponential backoff until success.
    async fn cleanup_log(&self) {
        let mut delay = INITIAL_BACKOFF;
        while self.manager.changelog.remove(&self.id).await.is_err() {
            tokio::time::sleep(delay).await;
            delay = (delay.mul_f32(1.5)).min(MAX_BACKOFF);
        }
    }
}
373
374impl Drop for ChangeState {
375 fn drop(&mut self) {
376 if self.phase != ChangePhase::Completed {
377 objectstore_log::error!(
378 change = ?self.change,
379 phase = ?self.phase,
380 "Operation dropped without completing cleanup"
381 );
382 }
383 }
384}
385
/// RAII guard that tracks cleanup state for a multi-step storage change.
///
/// When dropped in a non-`Completed` phase, determines the LT blob to clean up
/// and spawns a background task to delete it. If no tokio runtime is available
/// (e.g., during shutdown), the drop logs an error instead of panicking.
#[derive(Debug)]
pub struct ChangeGuard {
    /// `Some` until consumed by drop; `None` means drop has nothing to do.
    state: Option<ChangeState>,
}
395
396impl ChangeGuard {
397 /// Advances the operation to the given phase. Zero-cost, no I/O.
398 pub(crate) fn advance(&mut self, phase: ChangePhase) {
399 if let Some(ref mut state) = self.state {
400 state.phase = phase;
401 }
402 }
403}
404
405impl Drop for ChangeGuard {
406 fn drop(&mut self) {
407 if let Some(state) = self.state.take()
408 && state.phase != ChangePhase::Completed
409 && let Ok(handle) = tokio::runtime::Handle::try_current()
410 {
411 handle.spawn(state.cleanup());
412 }
413
414 // NB: Drop of `ChangeState` logs an error if cleanup is not scheduled.
415 }
416}
417
418#[cfg(test)]
419mod tests {
420 use objectstore_types::scope::{Scope, Scopes};
421
422 use super::*;
423 use crate::id::ObjectContext;
424
    /// Builds an [`ObjectId`] for tests under a fixed "testing" usecase/scope.
    fn make_id(key: &str) -> ObjectId {
        ObjectId::new(
            ObjectContext {
                usecase: "testing".into(),
                scopes: Scopes::from_iter([Scope::create("testing", "value").unwrap()]),
            },
            key.into(),
        )
    }
434
    /// A recorded entry is visible to a subsequent scan.
    #[tokio::test]
    async fn record_then_scan_returns_entry() {
        let log = InMemoryChangeLog::default();
        let id = ChangeId::new();
        let change = Change {
            id: make_id("object-key"),
            new: Some(make_id("object-key/rev1")),
            old: None,
        };

        log.record(&id, &change).await.unwrap();

        let entries = log.scan().await.unwrap();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].0, id);
    }
451
    /// A removed entry no longer appears in scans.
    #[tokio::test]
    async fn remove_then_scan_does_not_return_entry() {
        let log = InMemoryChangeLog::default();
        let id = ChangeId::new();
        let change = Change {
            id: make_id("object-key"),
            new: None,
            old: Some(make_id("object-key/rev1")),
        };

        log.record(&id, &change).await.unwrap();
        log.remove(&id).await.unwrap();

        let entries = log.scan().await.unwrap();
        assert!(entries.is_empty());
    }
468
    /// Exercises the idempotency contract of [`ChangeLog::remove`].
    #[tokio::test]
    async fn remove_nonexistent_entry_is_not_an_error() {
        let log = InMemoryChangeLog::default();
        let id = ChangeId::new();

        // No prior `record`: removal must still succeed.
        log.remove(&id).await.unwrap();
    }
476
    /// When the tokio runtime is dropped while an operation is in flight, the `ChangeGuard`
    /// drops outside any runtime and cannot schedule cleanup. The log entry must persist
    /// so that a future recovery pass can identify and clean up orphaned blobs.
    ///
    /// NB: deliberately a plain `#[test]` so the runtime's lifetime can be
    /// scoped more narrowly than the guard's.
    #[test]
    fn runtime_drop_while_pending_preserves_log_entry() {
        use crate::backend::in_memory::InMemoryBackend;

        let log = InMemoryChangeLog::default();
        let manager = ChangeManager::new(
            Box::new(InMemoryBackend::new("hv")),
            Box::new(InMemoryBackend::new("lt")),
            Box::new(log.clone()),
        );

        let guard = {
            let rt = tokio::runtime::Runtime::new().unwrap();
            // Simulate a mid-flight operation that recorded its change but did not complete.
            rt.block_on(manager.record(Change {
                id: make_id("crash-test"),
                new: Some(make_id("crash-test/rev")),
                old: None,
            }))
            .unwrap()
            // Runtime drops here while `guard` is still alive outside it.
        };

        // Guard drops with no runtime active: cleanup cannot be scheduled.
        drop(guard);

        // Log entry must survive so recovery can clean up the orphaned blob.
        let rt = tokio::runtime::Runtime::new().unwrap();
        let entries = rt.block_on(log.scan()).unwrap();
        assert_eq!(entries.len(), 1, "log entry must persist");
    }
511}