CRGC: Fault-Recovering Actor Garbage Collection in Pekko
Dan Plyukhin, Gul Agha, and Fabrizio Montesi. PLDI 2025.
PDF |
Video |
Slides |
Cite
@article{plyukhin2025crgc,
  author     = {Plyukhin, Dan and Agha, Gul and Montesi, Fabrizio},
  title      = {{CRGC}: Fault-Recovering Actor Garbage Collection in {Pekko}},
  year       = {2025},
  issue_date = {June 2025},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  volume     = {9},
  number     = {PLDI},
  url        = {https://doi.org/10.1145/3729288},
  doi        = {10.1145/3729288},
  abstract   = {Actors are lightweight reactive processes that communicate by asynchronous message-passing. Actors address common problems like concurrency control and fault tolerance, but resource management remains challenging: in all four of the most popular actor frameworks (Pekko, Akka, Erlang, and Elixir) programmers must explicitly kill actors to free up resources. To simplify resource management, researchers have devised actor garbage collectors (actor GCs) that monitor the application and detect when actors are safe to kill. However, existing actor GCs are impractical for distributed systems where the network is unreliable and nodes can fail. The simplest actor GCs do not collect cyclic garbage, whereas more sophisticated actor GCs are not fault-recovering: dropped messages and crashed nodes can cause actors to become garbage that never gets collected. We present Conflict-free Replicated Garbage Collection (CRGC): the first fault-recovering cyclic actor GC. In CRGC, actors and nodes record information locally and broadcast updates to the garbage collectors running on each node. CRGC does not require locks, explicit memory barriers, or any assumptions about message delivery order, except for reliable FIFO channels from actors to their local garbage collector. Moreover, CRGC is simple: we concisely present its operational semantics, which has been formalized in TLA+, and prove both soundness (non-garbage actors are never killed) and completeness (all garbage actors are eventually killed, under reasonable assumptions). We also present a preliminary implementation in Apache Pekko and measure its performance using two actor benchmark suites. Our results show the performance overhead of CRGC is competitive with simpler approaches like weighted reference counting, while also being much more powerful.},
  journal    = {Proc. ACM Program. Lang.},
  month      = jun,
  articleno  = {185},
  numpages   = {25},
  keywords   = {actor model, actors, distributed systems, fault tolerance, garbage collection}
}
Ozone: Fully Out-of-Order Choreographies
Dan Plyukhin, Marco Peressotti, and Fabrizio Montesi. ECOOP 2024.
PDF |
Video |
Slides |
Cite
@inproceedings{DBLP:conf/ecoop/PlyukhinPM24,
  author    = {Plyukhin, Dan and Peressotti, Marco and Montesi, Fabrizio},
  editor    = {Aldrich, Jonathan and Salvaneschi, Guido},
  title     = {Ozone: Fully Out-of-Order Choreographies},
  booktitle = {38th European Conference on Object-Oriented Programming, {ECOOP} 2024,
               September 16--20, 2024, Vienna, Austria},
  series    = {LIPIcs},
  volume    = {313},
  pages     = {31:1--31:28},
  publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"{u}}r Informatik},
  year      = {2024},
  url       = {https://doi.org/10.4230/LIPIcs.ECOOP.2024.31},
  doi       = {10.4230/LIPIcs.ECOOP.2024.31},
  timestamp = {Mon, 03 Mar 2025 21:02:51 +0100},
  biburl    = {https://dblp.org/rec/conf/ecoop/PlyukhinPM24.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}