BibTeX records: Christian Engelmann

download as .bib file

@inproceedings{DBLP:conf/europlop/EngelmannS23,
  author       = {Engelmann, Christian and
                  Somnath, Suhas},
  title        = {Science Use Case Design Patterns for Autonomous Experiments},
  booktitle    = {Proceedings of the 28th European Conference on Pattern Languages of
                  Programs, EuroPLoP 2023, Irsee, Germany, July 5-9, 2023},
  pages        = {26:1--26:14},
  publisher    = {{ACM}},
  year         = {2023},
  url          = {https://doi.org/10.1145/3628034.3628060},
  doi          = {10.1145/3628034.3628060},
  timestamp    = {Sat, 10 Feb 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/europlop/EngelmannS23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/ijhpca/AgulloAABBBBCCD22,
  author       = {Agullo, Emmanuel and
                  Altenbernd, Mirco and
                  Anzt, Hartwig and
                  Bautista{-}Gomez, Leonardo and
                  Benacchio, Tommaso and
                  Bonaventura, Luca and
                  Bungartz, Hans{-}Joachim and
                  Chatterjee, Sanjay and
                  Ciorba, Florina M. and
                  DeBardeleben, Nathan and
                  Drzisga, Daniel and
                  Eibl, Sebastian and
                  Engelmann, Christian and
                  Gansterer, Wilfried N. and
                  Giraud, Luc and
                  G{\"{o}}ddeke, Dominik and
                  Heisig, Marco and
                  J{\'{e}}z{\'{e}}quel, Fabienne and
                  Kohl, Nils and
                  Li, Xiaoye Sherry and
                  Lion, Romain and
                  Mehl, Miriam and
                  Mycek, Paul and
                  Obersteiner, Michael and
                  Quintana{-}Ort{\'{\i}}, Enrique S. and
                  Rizzi, Francesco and
                  R{\"{u}}de, Ulrich and
                  Schulz, Martin and
                  Fung, Fred and
                  Speck, Robert and
                  Stals, Linda and
                  Teranishi, Keita and
                  Thibault, Samuel and
                  Th{\"{o}}nnes, Dominik and
                  Wagner, Andreas and
                  Wohlmuth, Barbara I.},
  title        = {Resiliency in numerical algorithm design for extreme scale simulations},
  journal      = {Int. J. High Perform. Comput. Appl.},
  volume       = {36},
  number       = {2},
  pages        = {251--285},
  year         = {2022},
  url          = {https://doi.org/10.1177/10943420211055188},
  doi          = {10.1177/10943420211055188},
  timestamp    = {Mon, 26 Jun 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/ijhpca/AgulloAABBBBCCD22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/smc2/EngelmannKBBNSA22,
  author       = {Engelmann, Christian and
                  Kuchar, Olga A. and
                  Boehm, Swen and
                  Brim, Michael J. and
                  Naughton, Thomas J. and
                  Somnath, Suhas and
                  Atchley, Scott and
                  Lange, Jack and
                  Mintz, Ben and
                  Arenholz, Elke},
  editor       = {Kothe, Douglas B. and
                  Geist, Al and
                  Pophale, Swaroop and
                  Liu, Hong and
                  Parete{-}Koon, Suzanne},
  title        = {The {INTERSECT} Open Federated Architecture for the Laboratory of
                  the Future},
  booktitle    = {Accelerating Science and Engineering Discoveries Through Integrated
                  Research Infrastructure for Experiment, Big Data, Modeling and Simulation
                  - 22nd Smoky Mountains Computational Sciences and Engineering Conference,
                  {SMC} 2022, Virtual Event, August 23-25, 2022, Revised Selected Papers},
  series       = {Communications in Computer and Information Science},
  volume       = {1690},
  pages        = {173--190},
  publisher    = {Springer},
  year         = {2022},
  url          = {https://doi.org/10.1007/978-3-031-23606-8\_11},
  doi          = {10.1007/978-3-031-23606-8\_11},
  timestamp    = {Tue, 21 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/smc2/EngelmannKBBNSA22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/jpdc/KumarGPWSFET21,
  author       = {Kumar, Mohit and
                  Gupta, Saurabh and
                  Patel, Tirthak and
                  Wilder, Michael and
                  Shi, Weisong and
                  Fu, Song and
                  Engelmann, Christian and
                  Tiwari, Devesh},
  title        = {Study of interconnect errors, network congestion, and applications
                  characteristics for throttle prediction on a large scale {HPC} system},
  journal      = {J. Parallel Distributed Comput.},
  volume       = {153},
  pages        = {29--43},
  year         = {2021},
  url          = {https://doi.org/10.1016/j.jpdc.2021.03.001},
  doi          = {10.1016/J.JPDC.2021.03.001},
  timestamp    = {Mon, 05 Feb 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/jpdc/KumarGPWSFET21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/europar/KumarE21,
  author       = {Kumar, Mohit and
                  Engelmann, Christian},
  editor       = {Chaves, Ricardo and
                  Heras, Dora B. and
                  Ilic, Aleksandar and
                  Unat, Didem and
                  Badia, Rosa M. and
                  Bracciali, Andrea and
                  Diehl, Patrick and
                  Dubey, Anshu and
                  Oh, Sangyoon and
                  Scott, Stephen L. and
                  Ricci, Laura},
  title        = {{RDPM:} An Extensible Tool for Resilience Design Patterns Modelling},
  booktitle    = {Euro-Par 2021: Parallel Processing Workshops - Euro-Par 2021 International
                  Workshops, Lisbon, Portugal, August 30-31, 2021, Revised Selected
                  Papers},
  series       = {Lecture Notes in Computer Science},
  volume       = {13098},
  pages        = {283--297},
  publisher    = {Springer},
  year         = {2021},
  url          = {https://doi.org/10.1007/978-3-031-06156-1\_23},
  doi          = {10.1007/978-3-031-06156-1\_23},
  timestamp    = {Tue, 21 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/europar/KumarE21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/europar/JeongYGELCRG20,
  author       = {Jeong, Haewon and
                  Yang, Yaoqing and
                  Gupta, Vipul and
                  Engelmann, Christian and
                  Low, Tze Meng and
                  Cadambe, Viveck R. and
                  Ramchandran, Kannan and
                  Grover, Pulkit},
  editor       = {Malawski, Maciej and
                  Rzadca, Krzysztof},
  title        = {{3D} Coded {SUMMA:} Communication-Efficient and Robust Parallel Matrix
                  Multiplication},
  booktitle    = {Euro-Par 2020: Parallel Processing - 26th International Conference
                  on Parallel and Distributed Computing, Warsaw, Poland, August 24-28,
                  2020, Proceedings},
  series       = {Lecture Notes in Computer Science},
  volume       = {12247},
  pages        = {392--407},
  publisher    = {Springer},
  year         = {2020},
  url          = {https://doi.org/10.1007/978-3-030-57675-2\_25},
  doi          = {10.1007/978-3-030-57675-2\_25},
  timestamp    = {Tue, 07 May 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/europar/JeongYGELCRG20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/prdc/HukerikarE20,
  author       = {Hukerikar, Saurabh and
                  Engelmann, Christian},
  title        = {{PLEXUS:} {A} Pattern-Oriented Runtime System Architecture for Resilient
                  Extreme-Scale High-Performance Computing Systems},
  booktitle    = {25th {IEEE} Pacific Rim International Symposium on Dependable Computing,
                  {PRDC} 2020, Perth, Australia, December 1-4, 2020},
  pages        = {31--39},
  publisher    = {{IEEE}},
  year         = {2020},
  url          = {https://doi.org/10.1109/PRDC50213.2020.00014},
  doi          = {10.1109/PRDC50213.2020.00014},
  timestamp    = {Fri, 09 Apr 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/prdc/HukerikarE20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sc/KumarE20,
  author       = {Kumar, Mohit and
                  Engelmann, Christian},
  title        = {Models for Resilience Design Patterns},
  booktitle    = {10th {IEEE/ACM} Workshop on Fault Tolerance for {HPC} at eXtreme Scale,
                  FTXS@SC 2020, Atlanta, GA, USA, November 11, 2020},
  pages        = {21--30},
  publisher    = {{IEEE}},
  year         = {2020},
  url          = {https://doi.org/10.1109/FTXS51974.2020.00008},
  doi          = {10.1109/FTXS51974.2020.00008},
  timestamp    = {Fri, 30 Apr 2021 12:35:39 +0200},
  biburl       = {https://dblp.org/rec/conf/sc/KumarE20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sc/OstrouchovMAESR20,
  author       = {Ostrouchov, George and
                  Maxwell, Don and
                  Ashraf, Rizwan A. and
                  Engelmann, Christian and
                  Shankar, Mallikarjun and
                  Rogers, James H.},
  editor       = {Cuicchi, Christine and
                  Qualters, Irene and
                  Kramer, William T.},
  title        = {{GPU} lifetimes on {Titan} supercomputer: survival analysis and reliability},
  booktitle    = {Proceedings of the International Conference for High Performance Computing,
                  Networking, Storage and Analysis, {SC} 2020, Virtual Event / Atlanta,
                  Georgia, USA, November 9-19, 2020},
  pages        = {41},
  publisher    = {{IEEE/ACM}},
  year         = {2020},
  url          = {https://doi.org/10.1109/SC41405.2020.00045},
  doi          = {10.1109/SC41405.2020.00045},
  timestamp    = {Wed, 04 May 2022 13:02:27 +0200},
  biburl       = {https://dblp.org/rec/conf/sc/OstrouchovMAESR20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2010-13342,
  author       = {Agullo, Emmanuel and
                  Altenbernd, Mirco and
                  Anzt, Hartwig and
                  Bautista{-}Gomez, Leonardo and
                  Benacchio, Tommaso and
                  Bonaventura, Luca and
                  Bungartz, Hans{-}Joachim and
                  Chatterjee, Sanjay and
                  Ciorba, Florina M. and
                  DeBardeleben, Nathan and
                  Drzisga, Daniel and
                  Eibl, Sebastian and
                  Engelmann, Christian and
                  Gansterer, Wilfried N. and
                  Giraud, Luc and
                  G{\"{o}}ddeke, Dominik and
                  Heisig, Marco and
                  J{\'{e}}z{\'{e}}quel, Fabienne and
                  Kohl, Nils and
                  Li, Xiaoye Sherry and
                  Lion, Romain and
                  Mehl, Miriam and
                  Mycek, Paul and
                  Obersteiner, Michael and
                  Quintana{-}Ort{\'{\i}}, Enrique S. and
                  Rizzi, Francesco and
                  R{\"{u}}de, Ulrich and
                  Schulz, Martin and
                  Fung, Fred and
                  Speck, Robert and
                  Stals, Linda and
                  Teranishi, Keita and
                  Thibault, Samuel and
                  Th{\"{o}}nnes, Dominik and
                  Wagner, Andreas and
                  Wohlmuth, Barbara I.},
  title        = {Resiliency in Numerical Algorithm Design for Extreme Scale Simulations},
  journal      = {CoRR},
  volume       = {abs/2010.13342},
  year         = {2020},
  url          = {https://arxiv.org/abs/2010.13342},
  eprinttype   = {arXiv},
  eprint       = {2010.13342},
  timestamp    = {Wed, 04 Nov 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2010-13342.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iwomp/EngelmannVP19,
  author       = {Engelmann, Christian and
                  Vall{\'{e}}e, Geoffroy R. and
                  Pophale, Swaroop},
  editor       = {Fan, Xing and
                  de Supinski, Bronis R. and
                  Sinnen, Oliver and
                  Giacaman, Nasser},
  title        = {Concepts for {OpenMP} Target Offload Resilience},
  booktitle    = {OpenMP: Conquering the Full Hardware Spectrum - 15th International
                  Workshop on OpenMP, {IWOMP} 2019, Auckland, New Zealand, September
                  11-13, 2019, Proceedings},
  series       = {Lecture Notes in Computer Science},
  volume       = {11718},
  pages        = {78--93},
  publisher    = {Springer},
  year         = {2019},
  url          = {https://doi.org/10.1007/978-3-030-28596-8\_6},
  doi          = {10.1007/978-3-030-28596-8\_6},
  timestamp    = {Fri, 27 Dec 2019 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/iwomp/EngelmannVP19.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sc/SaoEEGV19,
  author       = {Sao, Piyush and
                  Engelmann, Christian and
                  Eswar, Srinivas and
                  Green, Oded and
                  Vuduc, Richard W.},
  title        = {Self-stabilizing Connected Components},
  booktitle    = {9th {IEEE/ACM} Workshop on Fault Tolerance for {HPC} at eXtreme Scale,
                  FTXS@SC 2019, Denver, CO, USA, November 22, 2019},
  pages        = {50--59},
  publisher    = {{IEEE}},
  year         = {2019},
  url          = {https://doi.org/10.1109/FTXS49593.2019.00011},
  doi          = {10.1109/FTXS49593.2019.00011},
  timestamp    = {Tue, 07 May 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/sc/SaoEEGV19.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/ijhpca/KattiFNE18,
  author       = {Katti, Amogh and
                  Di Fatta, Giuseppe and
                  Naughton, Thomas J. and
                  Engelmann, Christian},
  title        = {Epidemic failure detection and consensus for extreme parallelism},
  journal      = {Int. J. High Perform. Comput. Appl.},
  volume       = {32},
  number       = {5},
  pages        = {729--743},
  year         = {2018},
  url          = {https://doi.org/10.1177/1094342017690910},
  doi          = {10.1177/1094342017690910},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/ijhpca/KattiFNE18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/bigdataconf/HuiAPE18,
  author       = {Hui, Yawei and
                  Ashraf, Rizwan A. and
                  Park, Byung H. and
                  Engelmann, Christian},
  editor       = {Abe, Naoki and
                  Liu, Huan and
                  Pu, Calton and
                  Hu, Xiaohua and
                  Ahmed, Nesreen K. and
                  Qiao, Mu and
                  Song, Yang and
                  Kossmann, Donald and
                  Liu, Bing and
                  Lee, Kisung and
                  Tang, Jiliang and
                  He, Jingrui and
                  Saltz, Jeffrey S.},
  title        = {Real-Time Assessment of Supercomputer Status by a Comprehensive Informative
                  Metric through Streaming Processing},
  booktitle    = {{IEEE} International Conference on Big Data {(IEEE} BigData 2018),
                  Seattle, WA, USA, December 10-13, 2018},
  pages        = {5339--5341},
  publisher    = {{IEEE}},
  year         = {2018},
  url          = {https://doi.org/10.1109/BigData.2018.8621862},
  doi          = {10.1109/BIGDATA.2018.8621862},
  timestamp    = {Fri, 19 Nov 2021 16:08:20 +0100},
  biburl       = {https://dblp.org/rec/conf/bigdataconf/HuiAPE18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cluster/ParkHBALE18,
  author       = {Park, Byung H. and
                  Hui, Yawei and
                  Boehm, Swen and
                  Ashraf, Rizwan A. and
                  Layton, Christopher and
                  Engelmann, Christian},
  title        = {A Big Data Analytics Framework for {HPC} Log Data: Three Case Studies
                  Using the {Titan} Supercomputer Log},
  booktitle    = {{IEEE} International Conference on Cluster Computing, {CLUSTER} 2018,
                  Belfast, UK, September 10-13, 2018},
  pages        = {571--579},
  publisher    = {{IEEE} Computer Society},
  year         = {2018},
  url          = {https://doi.org/10.1109/CLUSTER.2018.00073},
  doi          = {10.1109/CLUSTER.2018.00073},
  timestamp    = {Thu, 23 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cluster/ParkHBALE18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/dsn/NieXGPEST18,
  author       = {Nie, Bin and
                  Xue, Ji and
                  Gupta, Saurabh and
                  Patel, Tirthak and
                  Engelmann, Christian and
                  Smirni, Evgenia and
                  Tiwari, Devesh},
  title        = {Machine Learning Models for {GPU} Error Prediction in a Large Scale
                  {HPC} System},
  booktitle    = {48th Annual {IEEE/IFIP} International Conference on Dependable Systems
                  and Networks, {DSN} 2018, Luxembourg City, Luxembourg, June 25-28,
                  2018},
  pages        = {95--106},
  publisher    = {{IEEE} Computer Society},
  year         = {2018},
  url          = {https://doi.org/10.1109/DSN.2018.00022},
  doi          = {10.1109/DSN.2018.00022},
  timestamp    = {Mon, 05 Feb 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/dsn/NieXGPEST18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/dsn/KumarGPWSFET18,
  author       = {Kumar, Mohit and
                  Gupta, Saurabh and
                  Patel, Tirthak and
                  Wilder, Michael and
                  Shi, Weisong and
                  Fu, Song and
                  Engelmann, Christian and
                  Tiwari, Devesh},
  title        = {Understanding and Analyzing Interconnect Errors and Network Congestion
                  on a Large Scale {HPC} System},
  booktitle    = {48th Annual {IEEE/IFIP} International Conference on Dependable Systems
                  and Networks, {DSN} 2018, Luxembourg City, Luxembourg, June 25-28,
                  2018},
  pages        = {107--114},
  publisher    = {{IEEE} Computer Society},
  year         = {2018},
  url          = {https://doi.org/10.1109/DSN.2018.00023},
  doi          = {10.1109/DSN.2018.00023},
  timestamp    = {Mon, 05 Feb 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/dsn/KumarGPWSFET18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/europar/AshrafE18,
  author       = {Ashraf, Rizwan A. and
                  Engelmann, Christian},
  editor       = {Mencagli, Gabriele and
                  Heras, Dora B. and
                  Cardellini, Valeria and
                  Casalicchio, Emiliano and
                  Jeannot, Emmanuel and
                  Wolf, Felix and
                  Salis, Antonio and
                  Schifanella, Claudio and
                  Manumachu, Ravi Reddy and
                  Ricci, Laura and
                  Beccuti, Marco and
                  Antonelli, Laura and
                  Garc{\'{\i}}a S{\'{a}}nchez, Jos{\'{e}} Daniel and
                  Scott, Stephen L.},
  title        = {Performance Efficient Multiresilience Using Checkpoint Recovery in
                  Iterative Algorithms},
  booktitle    = {Euro-Par 2018: Parallel Processing Workshops - Euro-Par 2018 International
                  Workshops, Turin, Italy, August 27-28, 2018, Revised Selected Papers},
  series       = {Lecture Notes in Computer Science},
  volume       = {11339},
  pages        = {813--825},
  publisher    = {Springer},
  year         = {2018},
  url          = {https://doi.org/10.1007/978-3-030-10549-5\_63},
  doi          = {10.1007/978-3-030-10549-5\_63},
  timestamp    = {Fri, 27 Dec 2019 21:26:53 +0100},
  biburl       = {https://dblp.org/rec/conf/europar/AshrafE18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/ldav/HuiPE18,
  author       = {Hui, Yawei and
                  Park, Byung{-}Hoon and
                  Engelmann, Christian},
  title        = {A Comprehensive Informative Metric for Summarizing {HPC} System Status},
  booktitle    = {8th {IEEE} Symposium on Large Data Analysis and Visualization, {LDAV}
                  2018, Berlin, Germany, October 21, 2018},
  pages        = {102--103},
  publisher    = {{IEEE}},
  year         = {2018},
  url          = {https://doi.org/10.1109/LDAV.2018.8739234},
  doi          = {10.1109/LDAV.2018.8739234},
  timestamp    = {Fri, 27 Dec 2019 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/ldav/HuiPE18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/pdp/AshrafHE18,
  author       = {Ashraf, Rizwan A. and
                  Hukerikar, Saurabh and
                  Engelmann, Christian},
  editor       = {Merelli, Ivan and
                  Li{\`{o}}, Pietro and
                  Kotenko, Igor V.},
  title        = {Shrink or Substitute: Handling Process Failures in {HPC} Systems Using
                  In-Situ Recovery},
  booktitle    = {26th Euromicro International Conference on Parallel, Distributed and
                  Network-based Processing, {PDP} 2018, Cambridge, United Kingdom, March
                  21-23, 2018},
  pages        = {178--185},
  publisher    = {{IEEE} Computer Society},
  year         = {2018},
  url          = {https://doi.org/10.1109/PDP2018.2018.00032},
  doi          = {10.1109/PDP2018.2018.00032},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/pdp/AshrafHE18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sc/HuiPE18,
  author       = {Hui, Yawei and
                  Park, Byung{-}Hoon and
                  Engelmann, Christian},
  title        = {A Comprehensive Informative Metric for Analyzing {HPC} System Status
                  Using the {LogSCAN} Platform},
  booktitle    = {{IEEE/ACM} 8th Workshop on Fault Tolerance for {HPC} at eXtreme Scale,
                  FTXS@SC 2018, Dallas, TX, USA, November 16, 2018},
  pages        = {29--38},
  publisher    = {{IEEE}},
  year         = {2018},
  url          = {https://doi.org/10.1109/FTXS.2018.00007},
  doi          = {10.1109/FTXS.2018.00007},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/sc/HuiPE18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sc/AshrafE18,
  author       = {Ashraf, Rizwan A. and
                  Engelmann, Christian},
  title        = {Analyzing the Impact of System Reliability Events on Applications
                  in the {Titan} Supercomputer},
  booktitle    = {{IEEE/ACM} 8th Workshop on Fault Tolerance for {HPC} at eXtreme Scale,
                  FTXS@SC 2018, Dallas, TX, USA, November 16, 2018},
  pages        = {39--48},
  publisher    = {{IEEE}},
  year         = {2018},
  url          = {https://doi.org/10.1109/FTXS.2018.00008},
  doi          = {10.1109/FTXS.2018.00008},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/sc/AshrafE18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/wosp/AshrafHE18,
  author       = {Ashraf, Rizwan A. and
                  Hukerikar, Saurabh and
                  Engelmann, Christian},
  editor       = {Wolter, Katinka and
                  Knottenbelt, William J. and
                  van Hoorn, Andr{\'{e}} and
                  Nambiar, Manoj},
  title        = {Pattern-based Modeling of Multiresilience Solutions for High-Performance
                  Computing},
  booktitle    = {Proceedings of the 2018 {ACM/SPEC} International Conference on Performance
                  Engineering, {ICPE} 2018, Berlin, Germany, April 09-13, 2018},
  pages        = {80--87},
  publisher    = {{ACM}},
  year         = {2018},
  url          = {https://doi.org/10.1145/3184407.3184421},
  doi          = {10.1145/3184407.3184421},
  timestamp    = {Fri, 02 Jun 2023 16:15:08 +0200},
  biburl       = {https://dblp.org/rec/conf/wosp/AshrafHE18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1801-04523,
  author       = {Ashraf, Rizwan A. and
                  Hukerikar, Saurabh and
                  Engelmann, Christian},
  title        = {Shrink or Substitute: Handling Process Failures in {HPC} Systems using
                  In-situ Recovery},
  journal      = {CoRR},
  volume       = {abs/1801.04523},
  year         = {2018},
  url          = {https://arxiv.org/abs/1801.04523},
  eprinttype   = {arXiv},
  eprint       = {1801.04523},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1801-04523.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1802-08233,
  author       = {Ashraf, Rizwan A. and
                  Hukerikar, Saurabh and
                  Engelmann, Christian},
  title        = {Pattern-based Modeling of Multiresilience Solutions for High-Performance
                  Computing},
  journal      = {CoRR},
  volume       = {abs/1802.08233},
  year         = {2018},
  url          = {https://arxiv.org/abs/1802.08233},
  eprinttype   = {arXiv},
  eprint       = {1802.08233},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1802-08233.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/superfri/HukerikarE17,
  author       = {Hukerikar, Saurabh and
                  Engelmann, Christian},
  title        = {Resilience Design Patterns: {A} Structured Approach to Resilience
                  at Extreme Scale},
  journal      = {Supercomput. Front. Innov.},
  volume       = {4},
  number       = {3},
  pages        = {4--42},
  year         = {2017},
  url          = {https://doi.org/10.14529/jsfi170301},
  doi          = {10.14529/JSFI170301},
  timestamp    = {Tue, 29 Dec 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/superfri/HukerikarE17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cluster/ParkHAE17,
  author       = {Park, Byung H. and
                  Hukerikar, Saurabh and
                  Adamson, Ryan and
                  Engelmann, Christian},
  title        = {Big Data Meets {HPC} Log Analytics: Scalable Approach to Understanding
                  Systems at Extreme Scale},
  booktitle    = {2017 {IEEE} International Conference on Cluster Computing, {CLUSTER}
                  2017, Honolulu, HI, USA, September 5-8, 2017},
  pages        = {758--765},
  publisher    = {{IEEE} Computer Society},
  year         = {2017},
  url          = {https://doi.org/10.1109/CLUSTER.2017.113},
  doi          = {10.1109/CLUSTER.2017.113},
  timestamp    = {Thu, 23 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cluster/ParkHAE17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/europar/HukerikarE17,
  author       = {Hukerikar, Saurabh and
                  Engelmann, Christian},
  editor       = {Heras, Dora Blanco and
                  Boug{\'{e}}, Luc and
                  Mencagli, Gabriele and
                  Jeannot, Emmanuel and
                  Sakellariou, Rizos and
                  Badia, Rosa M. and
                  Barbosa, Jorge G. and
                  Ricci, Laura and
                  Scott, Stephen L. and
                  Lankes, Stefan and
                  Weidendorfer, Josef},
  title        = {Pattern-Based Modeling of High-Performance Computing Resilience},
  booktitle    = {Euro-Par 2017: Parallel Processing Workshops - Euro-Par 2017 International
                  Workshops, Santiago de Compostela, Spain, August 28-29, 2017, Revised
                  Selected Papers},
  series       = {Lecture Notes in Computer Science},
  volume       = {10659},
  pages        = {557--568},
  publisher    = {Springer},
  year         = {2017},
  url          = {https://doi.org/10.1007/978-3-319-75178-8\_45},
  doi          = {10.1007/978-3-319-75178-8\_45},
  timestamp    = {Thu, 14 Oct 2021 10:28:38 +0200},
  biburl       = {https://dblp.org/rec/conf/europar/HukerikarE17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/europlop/HukerikarE17,
  author       = {Hukerikar, Saurabh and
                  Engelmann, Christian},
  title        = {A Pattern Language for High-Performance Computing Resilience},
  booktitle    = {Proceedings of the 22nd European Conference on Pattern Languages of
                  Programs, EuroPLoP 2017, Irsee, Germany, July 12-16, 2017},
  pages        = {12:1--12:16},
  publisher    = {{ACM}},
  year         = {2017},
  url          = {https://doi.org/10.1145/3147704.3147718},
  doi          = {10.1145/3147704.3147718},
  timestamp    = {Tue, 06 Nov 2018 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/europlop/HukerikarE17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/hpdc/HukerikarAE17,
  author       = {Hukerikar, Saurabh and
                  Ashraf, Rizwan A. and
                  Engelmann, Christian},
  title        = {Towards New Metrics for High-Performance Computing Resilience},
  booktitle    = {Proceedings of the {ACM} Workshop on Fault-Tolerance for {HPC} at
                  Extreme Scale, FTXS@HPDC 2017, Washington, DC, USA, June, 2017},
  pages        = {23--30},
  publisher    = {{ACM}},
  year         = {2017},
  url          = {https://doi.org/10.1145/3086157.3086163},
  doi          = {10.1145/3086157.3086163},
  timestamp    = {Wed, 14 Nov 2018 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/hpdc/HukerikarAE17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/mascots/NieXGEST17,
  author       = {Nie, Bin and
                  Xue, Ji and
                  Gupta, Saurabh and
                  Engelmann, Christian and
                  Smirni, Evgenia and
                  Tiwari, Devesh},
  title        = {Characterizing Temperature, Power, and Soft-Error Behaviors in Data
                  Center Systems: Insights, Challenges, and Opportunities},
  booktitle    = {25th {IEEE} International Symposium on Modeling, Analysis, and Simulation
                  of Computer and Telecommunication Systems, {MASCOTS} 2017, Banff,
                  AB, Canada, September 20-22, 2017},
  pages        = {22--31},
  publisher    = {{IEEE} Computer Society},
  year         = {2017},
  url          = {https://doi.org/10.1109/MASCOTS.2017.12},
  doi          = {10.1109/MASCOTS.2017.12},
  timestamp    = {Thu, 23 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/mascots/NieXGEST17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sc/GuptaPET17,
  author       = {Gupta, Saurabh
                  and Patel, Tirthak
                  and Engelmann, Christian
                  and Tiwari, Devesh},
  editor       = {Mohr, Bernd
                  and Raghavan, Padma},
  title        = {Failures in large scale systems: long-term measurement,
                  analysis, and implications},
  booktitle    = {Proceedings of the International Conference for High Performance
                  Computing, Networking, Storage and Analysis, {SC} 2017, Denver,
                  CO, USA, November 12 - 17, 2017},
  pages        = {44},
  publisher    = {{ACM}},
  year         = {2017},
  url          = {https://doi.org/10.1145/3126908.3126937},
  doi          = {10.1145/3126908.3126937},
  timestamp    = {Mon, 05 Feb 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/sc/GuptaPET17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint: eprinttype is aligned with the other fields and the URL uses HTTPS.
@article{DBLP:journals/corr/abs-1708-06884,
  author       = {Byung H. Park and
                  Saurabh Hukerikar and
                  Ryan Adamson and
                  Christian Engelmann},
  title        = {Big Data Meets {HPC} Log Analytics: Scalable Approach to Understanding
                  Systems at Extreme Scale},
  journal      = {CoRR},
  volume       = {abs/1708.06884},
  year         = {2017},
  url          = {https://arxiv.org/abs/1708.06884},
  eprinttype   = {arXiv},
  eprint       = {1708.06884},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1708-06884.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint: eprinttype is aligned with the other fields and the URL uses HTTPS.
@article{DBLP:journals/corr/abs-1708-07422,
  author       = {Saurabh Hukerikar and
                  Christian Engelmann},
  title        = {Resilience Design Patterns: {A} Structured Approach to Resilience
                  at Extreme Scale},
  journal      = {CoRR},
  volume       = {abs/1708.07422},
  year         = {2017},
  url          = {https://arxiv.org/abs/1708.07422},
  eprinttype   = {arXiv},
  eprint       = {1708.07422},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1708-07422.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint: eprinttype is aligned with the other fields and the URL uses HTTPS.
@article{DBLP:journals/corr/abs-1710-02627,
  author       = {Saurabh Hukerikar and
                  Christian Engelmann},
  title        = {Pattern-based Modeling of High-Performance Computing Resilience},
  journal      = {CoRR},
  volume       = {abs/1710.02627},
  year         = {2017},
  url          = {https://arxiv.org/abs/1710.02627},
  eprinttype   = {arXiv},
  eprint       = {1710.02627},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1710-02627.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint: eprinttype is aligned with the other fields and the URL uses HTTPS.
@article{DBLP:journals/corr/abs-1710-09074,
  author       = {Saurabh Hukerikar and
                  Christian Engelmann},
  title        = {A Pattern Language for High-Performance Computing Resilience},
  journal      = {CoRR},
  volume       = {abs/1710.09074},
  year         = {2017},
  url          = {https://arxiv.org/abs/1710.09074},
  eprinttype   = {arXiv},
  eprint       = {1710.09074},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1710-09074.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% The journal is stored under its full name; abbreviation is left to the bibliography style.
@article{DBLP:journals/concurrency/EngelmannN16,
  author       = {Christian Engelmann and
                  Thomas J. Naughton},
  title        = {A new deadlock resolution protocol and message matching algorithm
                  for the extreme-scale simulator},
  journal      = {Concurrency and Computation: Practice and Experience},
  volume       = {28},
  number       = {12},
  pages        = {3369--3389},
  year         = {2016},
  url          = {https://doi.org/10.1002/cpe.3805},
  doi          = {10.1002/CPE.3805},
  timestamp    = {Mon, 02 Mar 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/concurrency/EngelmannN16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/dsn/TangTGHLEH16,
  author       = {Tang, Kun
                  and Tiwari, Devesh
                  and Gupta, Saurabh
                  and Huang, Ping
                  and Lu, Qiqi
                  and Engelmann, Christian
                  and He, Xubin},
  title        = {Power-Capping Aware Checkpointing: On the Interplay Among
                  Power-Capping, Temperature, Reliability, Performance, and Energy},
  booktitle    = {46th Annual {IEEE/IFIP} International Conference on Dependable
                  Systems and Networks, {DSN} 2016, Toulouse, France,
                  June 28 - July 1, 2016},
  pages        = {311--322},
  publisher    = {{IEEE} Computer Society},
  year         = {2016},
  url          = {https://doi.org/10.1109/DSN.2016.36},
  doi          = {10.1109/DSN.2016.36},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/dsn/TangTGHLEH16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/dsrt/LagadapatiME16,
  author       = {Lagadapati, Mahesh
                  and Mueller, Frank
                  and Engelmann, Christian},
  title        = {Benchmark Generation and Simulation at Extreme Scale},
  booktitle    = {20th {IEEE/ACM} International Symposium on Distributed
                  Simulation and Real Time Applications, {DS-RT} 2016, London,
                  United Kingdom, September 21-23, 2016},
  pages        = {9--18},
  publisher    = {{IEEE} Computer Society},
  year         = {2016},
  url          = {https://doi.org/10.1109/DS-RT.2016.18},
  doi          = {10.1109/DS-RT.2016.18},
  timestamp    = {Thu, 23 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/dsrt/LagadapatiME16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% doi and url hold the raw identifier (plain "_"), since these fields are treated as verbatim link targets.
@inproceedings{DBLP:conf/europar/NaughtonEVAS16,
  author       = {Thomas J. Naughton and
                  Christian Engelmann and
                  Geoffroy Vall{\'{e}}e and
                  Ferrol Aderholdt and
                  Stephen L. Scott},
  editor       = {Fr{\'{e}}d{\'{e}}ric Desprez and
                  Pierre{-}Fran{\c{c}}ois Dutot and
                  Christos Kaklamanis and
                  Loris Marchal and
                  Korbinian Molitorisz and
                  Laura Ricci and
                  Vittorio Scarano and
                  Miguel A. Vega{-}Rodr{\'{\i}}guez and
                  Ana Lucia Varbanescu and
                  Sascha Hunold and
                  Stephen L. Scott and
                  Stefan Lankes and
                  Josef Weidendorfer},
  title        = {A Cooperative Approach to Virtual Machine Based Fault Injection},
  booktitle    = {Euro-Par 2016: Parallel Processing Workshops - Euro-Par 2016 International
                  Workshops, Grenoble, France, August 24-26, 2016, Revised Selected
                  Papers},
  series       = {Lecture Notes in Computer Science},
  volume       = {10104},
  pages        = {671--682},
  publisher    = {Springer},
  year         = {2016},
  url          = {https://doi.org/10.1007/978-3-319-58943-5_54},
  doi          = {10.1007/978-3-319-58943-5_54},
  timestamp    = {Sun, 12 Nov 2023 02:07:45 +0100},
  biburl       = {https://dblp.org/rec/conf/europar/NaughtonEVAS16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/hpdc/ParchmanVNEBS16,
  author       = {Parchman, Zachary W.
                  and Vall{\'{e}}e, Geoffroy
                  and Naughton, Thomas J.
                  and Engelmann, Christian
                  and Bernholdt, David E.
                  and Scott, Stephen L.},
  editor       = {DeBardeleben, Nathan},
  title        = {Adding Fault Tolerance to {NPB} Benchmarks Using {ULFM}},
  booktitle    = {Proceedings of the {ACM} Workshop on Fault-Tolerance for {HPC}
                  at Extreme Scale, FTXS@HPDC 2016, Kyoto, Japan, May 31, 2016},
  pages        = {27--34},
  publisher    = {{ACM}},
  year         = {2016},
  url          = {https://doi.org/10.1145/2909428.2909434},
  doi          = {10.1145/2909428.2909434},
  timestamp    = {Wed, 07 Dec 2022 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/hpdc/ParchmanVNEBS16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/hpec/HukerikarE16,
  author       = {Hukerikar, Saurabh
                  and Engelmann, Christian},
  title        = {Havens: Explicit reliable memory regions for {HPC} applications},
  booktitle    = {2016 {IEEE} High Performance Extreme Computing Conference,
                  {HPEC} 2016, Waltham, MA, USA, September 13-15, 2016},
  pages        = {1--6},
  publisher    = {{IEEE}},
  year         = {2016},
  url          = {https://doi.org/10.1109/HPEC.2016.7761593},
  doi          = {10.1109/HPEC.2016.7761593},
  timestamp    = {Wed, 16 Oct 2019 14:14:52 +0200},
  biburl       = {https://dblp.org/rec/conf/hpec/HukerikarE16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% "Mini-Ckpts" is braced so that sentence-casing styles keep its capitalisation as written.
@inproceedings{DBLP:conf/ics/FialaMFE16,
  author       = {David Fiala and
                  Frank Mueller and
                  Kurt B. Ferreira and
                  Christian Engelmann},
  editor       = {Ozcan Ozturk and
                  Kemal Ebcioglu and
                  Mahmut T. Kandemir and
                  Onur Mutlu},
  title        = {{Mini-Ckpts}: Surviving {OS} Failures in Persistent Memory},
  booktitle    = {Proceedings of the 2016 International Conference on Supercomputing,
                  {ICS} 2016, Istanbul, Turkey, June 1-3, 2016},
  pages        = {7:1--7:14},
  publisher    = {{ACM}},
  year         = {2016},
  url          = {https://doi.org/10.1145/2925426.2926295},
  doi          = {10.1145/2925426.2926295},
  timestamp    = {Mon, 22 Mar 2021 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/ics/FialaMFE16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/ipps/Bautista-GomezG16,
  author       = {Bautista{-}Gomez, Leonardo Arturo
                  and Gainaru, Ana
                  and Perarnau, Swann
                  and Tiwari, Devesh
                  and Gupta, Saurabh
                  and Engelmann, Christian
                  and Cappello, Franck
                  and Snir, Marc},
  title        = {Reducing Waste in Extreme Scale Systems through Introspective
                  Analysis},
  booktitle    = {2016 {IEEE} International Parallel and Distributed Processing
                  Symposium, {IPDPS} 2016, Chicago, IL, USA, May 23-27, 2016},
  pages        = {212--221},
  publisher    = {{IEEE} Computer Society},
  year         = {2016},
  url          = {https://doi.org/10.1109/IPDPS.2016.100},
  doi          = {10.1109/IPDPS.2016.100},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/ipps/Bautista-GomezG16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% doi and url hold the raw identifier (plain "_"), since these fields are treated as verbatim link targets.
@inproceedings{DBLP:conf/lcpc/HukerikarE16,
  author       = {Saurabh Hukerikar and
                  Christian Engelmann},
  editor       = {Chen Ding and
                  John Criswell and
                  Peng Wu},
  title        = {Language Support for Reliable Memory Regions},
  booktitle    = {Languages and Compilers for Parallel Computing - 29th International
                  Workshop, {LCPC} 2016, Rochester, NY, USA, September 28-30, 2016,
                  Revised Papers},
  series       = {Lecture Notes in Computer Science},
  volume       = {10136},
  pages        = {73--87},
  publisher    = {Springer},
  year         = {2016},
  url          = {https://doi.org/10.1007/978-3-319-52709-3_6},
  doi          = {10.1007/978-3-319-52709-3_6},
  timestamp    = {Tue, 28 Jul 2020 07:46:49 +0200},
  biburl       = {https://dblp.org/rec/conf/lcpc/HukerikarE16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint: eprinttype is aligned with the other fields and the URL uses HTTPS.
@article{DBLP:journals/corr/HukerikarE16,
  author       = {Saurabh Hukerikar and
                  Christian Engelmann},
  title        = {Havens: Explicit Reliable Memory Regions for {HPC} Applications},
  journal      = {CoRR},
  volume       = {abs/1610.08494},
  year         = {2016},
  url          = {https://arxiv.org/abs/1610.08494},
  eprinttype   = {arXiv},
  eprint       = {1610.08494},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/HukerikarE16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint: eprinttype is aligned with the other fields and the URL uses HTTPS.
@article{DBLP:journals/corr/HukerikarE16a,
  author       = {Saurabh Hukerikar and
                  Christian Engelmann},
  title        = {Resilience Design Patterns - {A} Structured Approach to Resilience
                  at Extreme Scale},
  journal      = {CoRR},
  volume       = {abs/1611.02717},
  year         = {2016},
  url          = {https://arxiv.org/abs/1611.02717},
  eprinttype   = {arXiv},
  eprint       = {1611.02717},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/HukerikarE16a.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint: eprinttype is aligned with the other fields and the URL uses HTTPS.
@article{DBLP:journals/corr/HukerikarE16b,
  author       = {Saurabh Hukerikar and
                  Christian Engelmann},
  title        = {Language Support for Reliable Memory Regions},
  journal      = {CoRR},
  volume       = {abs/1611.02823},
  year         = {2016},
  url          = {https://arxiv.org/abs/1611.02823},
  eprinttype   = {arXiv},
  eprint       = {1611.02823},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/HukerikarE16b.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% "Di Fatta" is written in comma form so the capitalised particle stays part of the surname.
@inproceedings{DBLP:conf/pvm/KattiFNE15,
  author       = {Amogh Katti and
                  Di Fatta, Giuseppe and
                  Thomas J. Naughton and
                  Christian Engelmann},
  editor       = {Jack J. Dongarra and
                  Alexandre Denis and
                  Brice Goglin and
                  Emmanuel Jeannot and
                  Guillaume Mercier},
  title        = {Scalable and Fault Tolerant Failure Detection and Consensus},
  booktitle    = {Proceedings of the 22nd European {MPI} Users' Group Meeting, EuroMPI
                  2015, Bordeaux, France, September 21-23, 2015},
  pages        = {13:1--13:9},
  publisher    = {{ACM}},
  year         = {2015},
  url          = {https://doi.org/10.1145/2802658.2802660},
  doi          = {10.1145/2802658.2802660},
  timestamp    = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/pvm/KattiFNE15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% The journal is stored under its full name; abbreviation is left to the bibliography style.
@article{DBLP:journals/fgcs/Engelmann14,
  author       = {Christian Engelmann},
  title        = {Scaling to a million cores and beyond: Using light-weight simulation
                  to understand the challenges ahead on the road to exascale},
  journal      = {Future Generation Computer Systems},
  volume       = {30},
  pages        = {59--65},
  year         = {2014},
  url          = {https://doi.org/10.1016/j.future.2013.04.014},
  doi          = {10.1016/J.FUTURE.2013.04.014},
  timestamp    = {Wed, 19 Feb 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/fgcs/Engelmann14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% "Van Hensbergen" is written in comma form so the capitalised particle stays part of the surname;
% the journal is stored under its full name.
@article{DBLP:journals/ijhpca/SnirWAABBBBCCCCDDEEFGGJKLLMMSSH14,
  author       = {Marc Snir and
                  Robert W. Wisniewski and
                  Jacob A. Abraham and
                  Sarita V. Adve and
                  Saurabh Bagchi and
                  Pavan Balaji and
                  James F. Belak and
                  Pradip Bose and
                  Franck Cappello and
                  Bill Carlson and
                  Andrew A. Chien and
                  Paul Coteus and
                  Nathan DeBardeleben and
                  Pedro C. Diniz and
                  Christian Engelmann and
                  Mattan Erez and
                  Saverio Fazzari and
                  Al Geist and
                  Rinku Gupta and
                  Fred Johnson and
                  Sriram Krishnamoorthy and
                  Sven Leyffer and
                  Dean Liberty and
                  Subhasish Mitra and
                  Todd S. Munson and
                  Rob Schreiber and
                  Jon Stearley and
                  Van Hensbergen, Eric},
  title        = {Addressing failures in exascale computing},
  journal      = {The International Journal of High Performance Computing Applications},
  volume       = {28},
  number       = {2},
  pages        = {129--173},
  year         = {2014},
  url          = {https://doi.org/10.1177/1094342014522573},
  doi          = {10.1177/1094342014522573},
  timestamp    = {Mon, 05 Feb 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/ijhpca/SnirWAABBBBCCCCDDEEFGGJKLLMMSSH14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/dsrt/EngelmannN14,
  author       = {Engelmann, Christian
                  and Naughton, Thomas J.},
  title        = {Improving the Performance of the Extreme-Scale Simulator},
  booktitle    = {18th {IEEE/ACM} International Symposium on Distributed
                  Simulation and Real Time Applications, {DS-RT} 2014, Toulouse,
                  France, October 1-3, 2014},
  pages        = {198--207},
  publisher    = {{IEEE} Computer Society},
  year         = {2014},
  url          = {https://doi.org/10.1109/DS-RT.2014.32},
  doi          = {10.1109/DS-RT.2014.32},
  timestamp    = {Thu, 23 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/dsrt/EngelmannN14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% The acronym HPC is braced in the title so sentence-casing styles keep it upper-case;
% doi and url hold the raw identifier (plain "_") as verbatim link targets.
@inproceedings{DBLP:conf/europar/NaughtonSEVAS14,
  author       = {Thomas J. Naughton and
                  Garry Smith and
                  Christian Engelmann and
                  Geoffroy Vall{\'{e}}e and
                  Ferrol Aderholdt and
                  Stephen L. Scott},
  editor       = {Lu{\'{\i}}s M. B. Lopes and
                  Julius Zilinskas and
                  Alexandru Costan and
                  Roberto G. Cascella and
                  Gabor Kecskemeti and
                  Emmanuel Jeannot and
                  Mario Cannataro and
                  Laura Ricci and
                  Siegfried Benkner and
                  Salvador Petit and
                  Vittorio Scarano and
                  Jos{\'{e}} Gracia and
                  Sascha Hunold and
                  Stephen L. Scott and
                  Stefan Lankes and
                  Christian Lengauer and
                  Jes{\'{u}}s Carretero and
                  Jens Breitbart and
                  Michael Alexander},
  title        = {What Is the Right Balance for Performance and Isolation with Virtualization
                  in {HPC}?},
  booktitle    = {Euro-Par 2014: Parallel Processing Workshops - Euro-Par 2014 International
                  Workshops, Porto, Portugal, August 25-26, 2014, Revised Selected Papers,
                  Part {I}},
  series       = {Lecture Notes in Computer Science},
  volume       = {8805},
  pages        = {570--581},
  publisher    = {Springer},
  year         = {2014},
  url          = {https://doi.org/10.1007/978-3-319-14325-5_49},
  doi          = {10.1007/978-3-319-14325-5_49},
  timestamp    = {Sun, 12 Nov 2023 02:07:45 +0100},
  biburl       = {https://dblp.org/rec/conf/europar/NaughtonSEVAS14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/pdp/NaughtonEVB14,
  author       = {Naughton, Thomas J.
                  and Engelmann, Christian
                  and Vall{\'{e}}e, Geoffroy
                  and B{\"{o}}hm, Swen},
  title        = {Supporting the Development of Resilient Message Passing
                  Applications Using Simulation},
  booktitle    = {22nd Euromicro International Conference on Parallel,
                  Distributed, and Network-Based Processing, {PDP} 2014, Torino,
                  Italy, February 12-14, 2014},
  pages        = {271--278},
  publisher    = {{IEEE} Computer Society},
  year         = {2014},
  url          = {https://doi.org/10.1109/PDP.2014.74},
  doi          = {10.1109/PDP.2014.74},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/pdp/NaughtonEVB14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@proceedings{DBLP:conf/sc/2014scala,
  editor       = {Alexandrov, Vassil
                  and Geist, Al
                  and Engelmann, Christian},
  title        = {Proceedings of the 5th Workshop on Latest Advances in Scalable
                  Algorithms for Large-Scale Systems, ScalA '14, New Orleans,
                  Louisiana, USA, November 16-21, 2014},
  publisher    = {{IEEE} Computer Society},
  year         = {2014},
  url          = {https://ieeexplore.ieee.org/xpl/conhome/7015710/proceeding},
  isbn         = {978-1-4799-7562-4},
  timestamp    = {Wed, 12 Jul 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/sc/2014scala.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% doi and url hold the raw identifier (plain "_"), since these fields are treated as verbatim link targets.
@inproceedings{DBLP:conf/europar/NaughtonBEV13,
  author       = {Thomas J. Naughton and
                  Swen B{\"{o}}hm and
                  Christian Engelmann and
                  Geoffroy Vall{\'{e}}e},
  editor       = {Dieter an Mey and
                  Michael Alexander and
                  Paolo Bientinesi and
                  Mario Cannataro and
                  Carsten Clauss and
                  Alexandru Costan and
                  Gabor Kecskemeti and
                  Christine Morin and
                  Laura Ricci and
                  Julio Sahuquillo and
                  Martin Schulz and
                  Vittorio Scarano and
                  Stephen L. Scott and
                  Josef Weidendorfer},
  title        = {Using Performance Tools to Support Experiments in {HPC} Resilience},
  booktitle    = {Euro-Par 2013: Parallel Processing Workshops - BigDataCloud, DIHC,
                  FedICI, HeteroPar, HiBB, LSDVE, MHPC, OMHI, PADABS, PROPER, Resilience,
                  ROME, and {UCHPC} 2013, Aachen, Germany, August 26-27, 2013. Revised
                  Selected Papers},
  series       = {Lecture Notes in Computer Science},
  volume       = {8374},
  pages        = {727--736},
  publisher    = {Springer},
  year         = {2013},
  url          = {https://doi.org/10.1007/978-3-642-54420-0_71},
  doi          = {10.1007/978-3-642-54420-0_71},
  timestamp    = {Wed, 19 Feb 2020 14:52:57 +0100},
  biburl       = {https://dblp.org/rec/conf/europar/NaughtonBEV13.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/ic-nc/ValleeNBE13,
  author       = {Vall{\'{e}}e, Geoffroy
                  and Naughton, Thomas J.
                  and B{\"{o}}hm, Swen
                  and Engelmann, Christian},
  editor       = {Guerrero, Juan E.},
  title        = {A Runtime Environment for Supporting Research in Resilient
                  {HPC} System Software {\&} Tools},
  booktitle    = {The First International Symposium on Computing and Networking -
                  Across Practical Development and Theoretical Research, Dogo
                  {SPA} Resort, Matsuyama, Japan, December 4-6, 2013},
  pages        = {213--219},
  publisher    = {{IEEE} Computer Society},
  year         = {2013},
  url          = {https://doi.org/10.1109/CANDAR.2013.38},
  doi          = {10.1109/CANDAR.2013.38},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/ic-nc/ValleeNBE13.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icpp/EngelmannN13,
  author       = {Engelmann, Christian
                  and Naughton, Thomas J.},
  title        = {Toward a Performance/Resilience Tool for Hardware/Software
                  Co-design of High-Performance Computing Systems},
  booktitle    = {42nd International Conference on Parallel Processing,
                  {ICPP} 2013, Lyon, France, October 1-4, 2013},
  pages        = {960--969},
  publisher    = {{IEEE} Computer Society},
  year         = {2013},
  url          = {https://doi.org/10.1109/ICPP.2013.114},
  doi          = {10.1109/ICPP.2013.114},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icpp/EngelmannN13.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% doi and url hold the raw identifier (plain "_"), since these fields are treated as verbatim link targets.
@inproceedings{DBLP:conf/ptw/LagadapatiME13,
  author       = {Mahesh Lagadapati and
                  Frank Mueller and
                  Christian Engelmann},
  editor       = {Andreas Kn{\"{u}}pfer and
                  Jos{\'{e}} Gracia and
                  Wolfgang E. Nagel and
                  Michael M. Resch},
  title        = {Tools for Simulation and Benchmark Generation at Exascale},
  booktitle    = {Tools for High Performance Computing 2013, Proceedings of the 7th
                  International Workshop on Parallel Tools for High Performance Computing,
                  September 2013, ZIH, Dresden, Germany},
  pages        = {19--24},
  publisher    = {Springer},
  year         = {2013},
  url          = {https://doi.org/10.1007/978-3-319-08144-1_2},
  doi          = {10.1007/978-3-319-08144-1_2},
  timestamp    = {Tue, 23 Mar 2021 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/ptw/LagadapatiME13.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% The journal is stored under its full name; abbreviation is left to the bibliography style.
@article{DBLP:journals/jpdc/WangMES12,
  author       = {Chao Wang and
                  Frank Mueller and
                  Christian Engelmann and
                  Stephen L. Scott},
  title        = {Proactive process-level live migration and back migration in {HPC}
                  environments},
  journal      = {Journal of Parallel and Distributed Computing},
  volume       = {72},
  number       = {2},
  pages        = {254--267},
  year         = {2012},
  url          = {https://doi.org/10.1016/j.jpdc.2011.10.009},
  doi          = {10.1016/J.JPDC.2011.10.009},
  timestamp    = {Tue, 23 Mar 2021 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/jpdc/WangMES12.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icdcs/ElliottKFMFE12,
  author       = {Elliott, James
                  and Kharbas, Kishor
                  and Fiala, David
                  and Mueller, Frank
                  and Ferreira, Kurt B.
                  and Engelmann, Christian},
  title        = {Combining Partial Redundancy and Checkpointing for {HPC}},
  booktitle    = {2012 {IEEE} 32nd International Conference on Distributed
                  Computing Systems, Macau, China, June 18-21, 2012},
  pages        = {615--626},
  publisher    = {{IEEE} Computer Society},
  year         = {2012},
  url          = {https://doi.org/10.1109/ICDCS.2012.56},
  doi          = {10.1109/ICDCS.2012.56},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icdcs/ElliottKFMFE12.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% "NVMalloc" is braced so that sentence-casing styles keep its capitalisation as written.
@inproceedings{DBLP:conf/ipps/WangVMMKE12,
  author       = {Chao Wang and
                  Sudharshan S. Vazhkudai and
                  Xiaosong Ma and
                  Fei Meng and
                  Youngjae Kim and
                  Christian Engelmann},
  title        = {{NVMalloc}: Exposing an Aggregate {SSD} Store as a Memory Partition
                  in Extreme-Scale Machines},
  booktitle    = {26th {IEEE} International Parallel and Distributed Processing Symposium,
                  {IPDPS} 2012, Shanghai, China, May 21-25, 2012},
  pages        = {957--968},
  publisher    = {{IEEE} Computer Society},
  year         = {2012},
  url          = {https://doi.org/10.1109/IPDPS.2012.90},
  doi          = {10.1109/IPDPS.2012.90},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/ipps/WangVMMKE12.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/pdp/BohmE12,
  author       = {B{\"{o}}hm, Swen
                  and Engelmann, Christian},
  editor       = {Stotzka, Rainer
                  and Schiffers, Michael
                  and Cotronis, Yannis},
  title        = {File {I/O} for {MPI} Applications in Redundant Execution Scenarios},
  booktitle    = {Proceedings of the 20th Euromicro International Conference on
                  Parallel, Distributed and Network-Based Processing, {PDP} 2012,
                  Munich, Germany, February 15-17, 2012},
  pages        = {112--119},
  publisher    = {{IEEE}},
  year         = {2012},
  url          = {https://doi.org/10.1109/PDP.2012.22},
  doi          = {10.1109/PDP.2012.22},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/pdp/BohmE12.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sc/FialaMERFB12,
  author       = {Fiala, David
                  and Mueller, Frank
                  and Engelmann, Christian
                  and Riesen, Rolf
                  and Ferreira, Kurt B.
                  and Brightwell, Ron},
  editor       = {Hollingsworth, Jeffrey K.},
  title        = {Detection and correction of silent data corruption for
                  large-scale high-performance computing},
  booktitle    = {{SC} Conference on High Performance Computing Networking,
                  Storage and Analysis, {SC} '12, Salt Lake City, UT, {USA} -
                  November 11 - 15, 2012},
  pages        = {78},
  publisher    = {{IEEE/ACM}},
  year         = {2012},
  url          = {https://doi.org/10.1109/SC.2012.49},
  doi          = {10.1109/SC.2012.49},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/sc/FialaMERFB12.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% "Di Martino" is written in comma form so the capitalised particle stays part of the surname;
% doi and url hold the raw identifier (plain "_") as verbatim link targets.
@inproceedings{DBLP:conf/europar/NaughtonVES11,
  author       = {Thomas J. Naughton and
                  Geoffroy Vall{\'{e}}e and
                  Christian Engelmann and
                  Stephen L. Scott},
  editor       = {Michael Alexander and
                  Pasqua D'Ambra and
                  Adam Belloum and
                  George Bosilca and
                  Mario Cannataro and
                  Marco Danelutto and
                  Di Martino, Beniamino and
                  Michael Gerndt and
                  Emmanuel Jeannot and
                  Raymond Namyst and
                  Jean Roman and
                  Stephen L. Scott and
                  Jesper Larsson Tr{\"{a}}ff and
                  Geoffroy Vall{\'{e}}e and
                  Josef Weidendorfer},
  title        = {A Case for Virtual Machine Based Fault Injection in a High-Performance
                  Computing Environment},
  booktitle    = {Euro-Par 2011: Parallel Processing Workshops - CCPI, CGWS, HeteroPar,
                  HiBB, HPCVirt, HPPC, HPSS, MDGS, ProPer, Resilience, UCHPC, VHPC,
                  Bordeaux, France, August 29 - September 2, 2011, Revised Selected
                  Papers, Part {I}},
  series       = {Lecture Notes in Computer Science},
  volume       = {7155},
  pages        = {234--243},
  publisher    = {Springer},
  year         = {2011},
  url          = {https://doi.org/10.1007/978-3-642-29737-3_27},
  doi          = {10.1007/978-3-642-29737-3_27},
  timestamp    = {Wed, 19 Feb 2020 14:52:57 +0100},
  biburl       = {https://dblp.org/rec/conf/europar/NaughtonVES11.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/europar/FialaFME11,
  author       = {David Fiala and
                  Kurt B. Ferreira and
                  Frank Mueller and
                  Christian Engelmann},
  editor       = {Michael Alexander and
                  Pasqua D'Ambra and
                  Adam Belloum and
                  George Bosilca and
                  Mario Cannataro and
                  Marco Danelutto and
                  Beniamino {Di Martino} and
                  Michael Gerndt and
                  Emmanuel Jeannot and
                  Raymond Namyst and
                  Jean Roman and
                  Stephen L. Scott and
                  Jesper Larsson Tr{\"{a}}ff and
                  Geoffroy Vall{\'{e}}e and
                  Josef Weidendorfer},
  title        = {A Tunable, Software-Based {DRAM} Error Detection and Correction Library
                  for {HPC}},
  booktitle    = {Euro-Par 2011: Parallel Processing Workshops - CCPI, CGWS, HeteroPar,
                  HiBB, HPCVirt, HPPC, HPSS, MDGS, ProPer, Resilience, UCHPC, VHPC,
                  Bordeaux, France, August 29 - September 2, 2011, Revised Selected
                  Papers, Part {II}},
  series       = {Lecture Notes in Computer Science},
  volume       = {7156},
  pages        = {251--261},
  publisher    = {Springer},
  year         = {2011},
  url          = {https://doi.org/10.1007/978-3-642-29740-3\_29},
  doi          = {10.1007/978-3-642-29740-3\_29},
  timestamp    = {Mon, 22 Mar 2021 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/europar/FialaFME11.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icppw/JonesE11,
  author       = {Ian S. Jones and
                  Christian Engelmann},
  editor       = {Jang{-}Ping Sheu and
                  Cho{-}Li Wang},
  title        = {Simulation of Large-Scale {HPC} Architectures},
  booktitle    = {2011 International Conference on Parallel Processing Workshops, {ICPPW}
                  2011, Taipei, Taiwan, Sept. 13-16, 2011},
  pages        = {447--456},
  publisher    = {{IEEE} Computer Society},
  year         = {2011},
  url          = {https://doi.org/10.1109/ICPPW.2011.44},
  doi          = {10.1109/ICPPW.2011.44},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icppw/JonesE11.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/ieeehpcs/BohmE11,
  author       = {Swen B{\"{o}}hm and
                  Christian Engelmann},
  editor       = {Waleed W. Smari and
                  John P. McIntire},
  title        = {{xSim}: The extreme-scale simulator},
  booktitle    = {2011 International Conference on High Performance Computing {\&}
                  Simulation, {HPCS} 2011, Istanbul, Turkey, July 4-8, 2011},
  pages        = {280--286},
  publisher    = {{IEEE}},
  year         = {2011},
  url          = {https://doi.org/10.1109/HPCSim.2011.5999835},
  doi          = {10.1109/HPCSIM.2011.5999835},
  timestamp    = {Tue, 28 Jul 2020 13:09:02 +0200},
  biburl       = {https://dblp.org/rec/conf/ieeehpcs/BohmE11.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sc/FialaMERF11,
  author       = {David Fiala and
                  Frank Mueller and
                  Christian Engelmann and
                  Rolf Riesen and
                  Kurt B. Ferreira},
  editor       = {Scott A. Lathrop and
                  Jim Costa and
                  William Kramer},
  title        = {Poster: detection and correction of silent data corruption for large-scale
                  high-performance computing},
  booktitle    = {Conference on High Performance Computing Networking, Storage and Analysis
                  - Companion Volume, {SC} 2011, Seattle, WA, USA, November 12-18, 2011},
  pages        = {47--48},
  publisher    = {{ACM}},
  year         = {2011},
  url          = {https://doi.org/10.1145/2148600.2148625},
  doi          = {10.1145/2148600.2148625},
  timestamp    = {Mon, 22 Mar 2021 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/sc/FialaMERF11.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sc/FialaFME11,
  author       = {David Fiala and
                  Kurt B. Ferreira and
                  Frank Mueller and
                  Christian Engelmann},
  editor       = {Scott A. Lathrop and
                  Jim Costa and
                  William Kramer},
  title        = {Poster: a tunable, software-based {DRAM} error detection and correction
                  library for {HPC}},
  booktitle    = {Conference on High Performance Computing Networking, Storage and Analysis
                  - Companion Volume, {SC} 2011, Seattle, WA, USA, November 12-18, 2011},
  pages        = {49--50},
  publisher    = {{ACM}},
  year         = {2011},
  url          = {https://doi.org/10.1145/2148600.2148626},
  doi          = {10.1145/2148600.2148626},
  timestamp    = {Mon, 22 Mar 2021 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/sc/FialaFME11.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/fgcs/ScottVNTEO10,
  author       = {Stephen L. Scott and
                  Geoffroy Vall{\'{e}}e and
                  Thomas J. Naughton and
                  Anand Tikotekar and
                  Christian Engelmann and
                  Hong Ong},
  title        = {System-level virtualization research at {Oak Ridge National Laboratory}},
  journal      = {Future Gener. Comput. Syst.},
  volume       = {26},
  number       = {3},
  pages        = {304--307},
  year         = {2010},
  url          = {https://doi.org/10.1016/j.future.2009.07.001},
  doi          = {10.1016/J.FUTURE.2009.07.001},
  timestamp    = {Wed, 19 Feb 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/fgcs/ScottVNTEO10.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/hpcc/BohmES10,
  author       = {Swen B{\"{o}}hm and
                  Christian Engelmann and
                  Stephen L. Scott},
  title        = {Aggregation of Real-Time System Monitoring Data for Analyzing Large-Scale
                  Parallel and Distributed Computing Environments},
  booktitle    = {12th {IEEE} International Conference on High Performance Computing
                  and Communications, {HPCC} 2010, 1-3 September 2010, Melbourne, Australia},
  pages        = {72--78},
  publisher    = {{IEEE}},
  year         = {2010},
  url          = {https://doi.org/10.1109/HPCC.2010.32},
  doi          = {10.1109/HPCC.2010.32},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/hpcc/BohmES10.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icpads/WangMES10,
  author       = {Chao Wang and
                  Frank Mueller and
                  Christian Engelmann and
                  Stephen L. Scott},
  title        = {Hybrid Checkpointing for {MPI} Jobs in {HPC} Environments},
  booktitle    = {16th {IEEE} International Conference on Parallel and Distributed Systems,
                  {ICPADS} 2010, Shanghai, China, December 8-10, 2010},
  pages        = {524--533},
  publisher    = {{IEEE} Computer Society},
  year         = {2010},
  url          = {https://doi.org/10.1109/ICPADS.2010.48},
  doi          = {10.1109/ICPADS.2010.48},
  timestamp    = {Thu, 23 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icpads/WangMES10.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sc/LiVBMMKES10,
  author       = {Min Li and
                  Sudharshan S. Vazhkudai and
                  Ali Raza Butt and
                  Fei Meng and
                  Xiaosong Ma and
                  Youngjae Kim and
                  Christian Engelmann and
                  Galen M. Shipman},
  title        = {Functional Partitioning to Optimize End-to-End Performance on Many-core
                  Architectures},
  booktitle    = {Conference on High Performance Computing Networking, Storage and Analysis,
                  {SC} 2010, New Orleans, LA, USA, November 13-19, 2010},
  pages        = {1--12},
  publisher    = {{IEEE}},
  year         = {2010},
  url          = {https://doi.org/10.1109/SC.2010.28},
  doi          = {10.1109/SC.2010.28},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/sc/LiVBMMKES10.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/jpdc/HeOECS09,
  author       = {Xubin He and
                  Li Ou and
                  Christian Engelmann and
                  Xin Chen and
                  Stephen L. Scott},
  title        = {Symmetric active/active metadata service for high availability parallel
                  file systems},
  journal      = {J. Parallel Distributed Comput.},
  volume       = {69},
  number       = {12},
  pages        = {961--973},
  year         = {2009},
  url          = {https://doi.org/10.1016/j.jpdc.2009.08.004},
  doi          = {10.1016/J.JPDC.2009.08.004},
  timestamp    = {Sun, 22 Oct 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/jpdc/HeOECS09.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/IEEEares/TaeratNCELOSE09,
  author       = {Narate Taerat and
                  Nichamon Naksinehaboon and
                  Clayton Chandler and
                  James Elliott and
                  Chokchai Leangsuksun and
                  George Ostrouchov and
                  Stephen L. Scott and
                  Christian Engelmann},
  title        = {{Blue Gene/L} Log Analysis and Time to Interrupt Estimation},
  booktitle    = {Proceedings of the Fourth International Conference on Availability,
                  Reliability and Security, {ARES} 2009, March 16-19, 2009, Fukuoka,
                  Japan},
  pages        = {173--180},
  publisher    = {{IEEE} Computer Society},
  year         = {2009},
  url          = {https://doi.org/10.1109/ARES.2009.105},
  doi          = {10.1109/ARES.2009.105},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/IEEEares/TaeratNCELOSE09.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/eurosys/TikotekarOAVNES09,
  author       = {Anand Tikotekar and
                  Hong Ong and
                  Sadaf R. Alam and
                  Geoffroy Vall{\'{e}}e and
                  Thomas J. Naughton and
                  Christian Engelmann and
                  Stephen L. Scott},
  editor       = {Stephen L. Scott and
                  Geoffroy Vall{\'{e}}e},
  title        = {Performance comparison of two virtual machine scenarios using an {HPC}
                  application: a case study using molecular dynamics simulations},
  booktitle    = {Proceedings of the 3rd {ACM} Workshop on System-level Virtualization
                  for High Performance Computing, HPCVirt '09, Nuremberg, Germany, March
                  31, 2009},
  pages        = {33--40},
  publisher    = {{ACM}},
  year         = {2009},
  url          = {https://doi.org/10.1145/1519138.1519143},
  doi          = {10.1145/1519138.1519143},
  timestamp    = {Sat, 09 Apr 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/eurosys/TikotekarOAVNES09.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/pdp/ValentiniBBPPE09,
  author       = {Alessandro Valentini and
                  Christian {Di Biagio} and
                  Fabrizio Batino and
                  Guido Pennella and
                  Fabrizio Palma and
                  Christian Engelmann},
  editor       = {Didier {El Baz} and
                  Fran{\c{c}}ois Spies and
                  Tom Gross},
  title        = {High Performance Computing with {Harness} over {InfiniBand}},
  booktitle    = {Proceedings of the 17th Euromicro International Conference on Parallel,
                  Distributed and Network-Based Processing, {PDP} 2009, Weimar, Germany,
                  18-20 February 2009},
  pages        = {151--154},
  publisher    = {{IEEE} Computer Society},
  year         = {2009},
  url          = {https://doi.org/10.1109/PDP.2009.64},
  doi          = {10.1109/PDP.2009.64},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/pdp/ValentiniBBPPE09.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/pdp/EngelmannVNS09,
  author       = {Christian Engelmann and
                  Geoffroy Vall{\'{e}}e and
                  Thomas J. Naughton and
                  Stephen L. Scott},
  editor       = {Didier {El Baz} and
                  Fran{\c{c}}ois Spies and
                  Tom Gross},
  title        = {Proactive Fault Tolerance Using Preemptive Migration},
  booktitle    = {Proceedings of the 17th Euromicro International Conference on Parallel,
                  Distributed and Network-Based Processing, {PDP} 2009, Weimar, Germany,
                  18-20 February 2009},
  pages        = {252--257},
  publisher    = {{IEEE} Computer Society},
  year         = {2009},
  url          = {https://doi.org/10.1109/PDP.2009.31},
  doi          = {10.1109/PDP.2009.31},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/pdp/EngelmannVNS09.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/ppopp/ScottEVNTOLNNPMWNV09,
  author       = {Stephen L. Scott and
                  Christian Engelmann and
                  Geoffroy Vall{\'{e}}e and
                  Thomas J. Naughton and
                  Anand Tikotekar and
                  George Ostrouchov and
                  Chokchai Leangsuksun and
                  Nichamon Naksinehaboon and
                  Raja Nassar and
                  Mihaela Paun and
                  Frank Mueller and
                  Chao Wang and
                  Arun Babu Nagarajan and
                  Jyothish Varma},
  editor       = {Daniel A. Reed and
                  Vivek Sarkar},
  title        = {A tunable holistic resiliency approach for high-performance computing
                  systems},
  booktitle    = {Proceedings of the 14th {ACM} {SIGPLAN} Symposium on Principles and
                  Practice of Parallel Programming, {PPOPP} 2009, Raleigh, NC, USA,
                  February 14-18, 2009},
  pages        = {305--306},
  publisher    = {{ACM}},
  year         = {2009},
  url          = {https://doi.org/10.1145/1504176.1504227},
  doi          = {10.1145/1504176.1504227},
  timestamp    = {Sun, 12 Jun 2022 19:46:08 +0200},
  biburl       = {https://dblp.org/rec/conf/ppopp/ScottEVNTOLNNPMWNV09.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/IEEEares/EngelmannSLH08,
  author       = {Christian Engelmann and
                  Stephen L. Scott and
                  Chokchai Leangsuksun and
                  Xubin He},
  title        = {Symmetric Active/Active Replication for Dependent Services},
  booktitle    = {Proceedings of the Third International Conference on Availability,
                  Reliability and Security, {ARES} 2008, March 4-7, 2008, Technical
                  University of Catalonia, Barcelona, Spain},
  pages        = {260--267},
  publisher    = {{IEEE} Computer Society},
  year         = {2008},
  url          = {https://doi.org/10.1109/ARES.2008.64},
  doi          = {10.1109/ARES.2008.64},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/IEEEares/EngelmannSLH08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/IEEEares/ValleeCETLNS08,
  author       = {Geoffroy Vall{\'{e}}e and
                  Kulathep Charoenpornwattana and
                  Christian Engelmann and
                  Anand Tikotekar and
                  Chokchai Leangsuksun and
                  Thomas J. Naughton and
                  Stephen L. Scott},
  title        = {A Framework for Proactive Fault Tolerance},
  booktitle    = {Proceedings of the Third International Conference on Availability,
                  Reliability and Security, {ARES} 2008, March 4-7, 2008, Technical
                  University of Catalonia, Barcelona, Spain},
  pages        = {659--664},
  publisher    = {{IEEE} Computer Society},
  year         = {2008},
  url          = {https://doi.org/10.1109/ARES.2008.171},
  doi          = {10.1109/ARES.2008.171},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/IEEEares/ValleeCETLNS08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/ccgrid/EngelmannSLH08,
  author       = {Christian Engelmann and
                  Stephen L. Scott and
                  Chokchai Leangsuksun and
                  Xubin He},
  title        = {Symmetric Active/Active High Availability for High-Performance Computing
                  System Services: Accomplishments and Limitations},
  booktitle    = {8th {IEEE} International Symposium on Cluster Computing and the Grid
                  (CCGrid 2008), 19-22 May 2008, Lyon, France},
  pages        = {813--818},
  publisher    = {{IEEE} Computer Society},
  year         = {2008},
  url          = {https://doi.org/10.1109/CCGRID.2008.78},
  doi          = {10.1109/CCGRID.2008.78},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/ccgrid/EngelmannSLH08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/europar/TikotekarVNOES08,
  author       = {Anand Tikotekar and
                  Geoffroy Vall{\'{e}}e and
                  Thomas J. Naughton and
                  Hong Ong and
                  Christian Engelmann and
                  Stephen L. Scott},
  editor       = {Eduardo C{\'{e}}sar and
                  Michael Alexander and
                  Achim Streit and
                  Jesper Larsson Tr{\"{a}}ff and
                  Christophe C{\'{e}}rin and
                  Andreas Kn{\"{u}}pfer and
                  Dieter Kranzlm{\"{u}}ller and
                  Shantenu Jha},
  title        = {An Analysis of {HPC} Benchmarks in Virtual Machine Environments},
  booktitle    = {Euro-Par 2008 Workshops - Parallel Processing, {VHPC} 2008, {UNICORE}
                  2008, {HPPC} 2008, {SGS} 2008, {PROPER} 2008, {ROIA} 2008, and {DPA}
                  2008, Las Palmas de Gran Canaria, Spain, August 25-26, 2008, Revised
                  Selected Papers},
  series       = {Lecture Notes in Computer Science},
  volume       = {5415},
  pages        = {63--71},
  publisher    = {Springer},
  year         = {2008},
  url          = {https://doi.org/10.1007/978-3-642-00955-6\_8},
  doi          = {10.1007/978-3-642-00955-6\_8},
  timestamp    = {Tue, 07 Jan 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/europar/TikotekarVNOES08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/eurosys/TikotekarVNOESF08,
  author       = {Anand Tikotekar and
                  Geoffroy Vall{\'{e}}e and
                  Thomas J. Naughton and
                  Hong Ong and
                  Christian Engelmann and
                  Stephen L. Scott and
                  Anthony M. Filippi},
  editor       = {Stephen L. Scott and
                  Geoffroy Vall{\'{e}}e},
  title        = {Effects of virtualization on a scientific application running a hyperspectral
                  radiative transfer code on virtual machines},
  booktitle    = {Proceedings of the 2nd Workshop on System-Level Virtualization for
                  High Performance Computing, HPCVirt '08, Glasgow, Scotland, UK, March
                  31, 2008},
  pages        = {16--23},
  publisher    = {{ACM}},
  year         = {2008},
  url          = {https://doi.org/10.1145/1435452.1435455},
  doi          = {10.1145/1435452.1435455},
  timestamp    = {Tue, 07 Nov 2023 11:31:29 +0100},
  biburl       = {https://dblp.org/rec/conf/eurosys/TikotekarVNOESF08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/pdp/KonningESG08,
  author       = {Bj{\"{o}}rn K{\"{o}}nning and
                  Christian Engelmann and
                  Stephen L. Scott and
                  Al Geist},
  title        = {Virtualized Environments for the {Harness} High Performance Computing
                  Workbench},
  booktitle    = {16th Euromicro International Conference on Parallel, Distributed and
                  Network-Based Processing {(PDP} 2008), 13-15 February 2008, Toulouse,
                  France},
  pages        = {133--140},
  publisher    = {{IEEE} Computer Society},
  year         = {2008},
  url          = {https://doi.org/10.1109/PDP.2008.14},
  doi          = {10.1109/PDP.2008.14},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/pdp/KonningESG08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/pdp/ValleeNEOS08,
  author       = {Geoffroy Vall{\'{e}}e and
                  Thomas J. Naughton and
                  Christian Engelmann and
                  Hong Ong and
                  Stephen L. Scott},
  title        = {System-Level Virtualization for High Performance Computing},
  booktitle    = {16th Euromicro International Conference on Parallel, Distributed and
                  Network-Based Processing {(PDP} 2008), 13-15 February 2008, Toulouse,
                  France},
  pages        = {636--643},
  publisher    = {{IEEE} Computer Society},
  year         = {2008},
  url          = {https://doi.org/10.1109/PDP.2008.85},
  doi          = {10.1109/PDP.2008.85},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/pdp/ValleeNEOS08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sc/WangMES08,
  author       = {Chao Wang and
                  Frank Mueller and
                  Christian Engelmann and
                  Stephen L. Scott},
  title        = {Proactive process-level live migration in {HPC} environments},
  booktitle    = {Proceedings of the {ACM/IEEE} Conference on High Performance Computing,
                  {SC} 2008, November 15-21, 2008, Austin, Texas, {USA}},
  pages        = {43},
  publisher    = {{IEEE/ACM}},
  year         = {2008},
  url          = {https://doi.org/10.1109/SC.2008.5222634},
  doi          = {10.1109/SC.2008.5222634},
  timestamp    = {Tue, 23 Mar 2021 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/sc/WangMES08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/svm2/ValleeNOTEBAS08,
  author       = {Geoffroy Vall{\'{e}}e and
                  Thomas J. Naughton and
                  Hong Ong and
                  Anand Tikotekar and
                  Christian Engelmann and
                  Wesley Bland and
                  Ferrol Aderholdt and
                  Stephen L. Scott},
  editor       = {Latifa Boursas and
                  Mark Carlson and
                  Wolfgang Hommel and
                  Michelle Sibilla and
                  Kes Wold},
  title        = {Virtual System Environments},
  booktitle    = {Systems and Virtualization Management. Standards and New Technologies
                  - Second International Workshop, {SVM} 2008, Munich, Germany, October
                  21-22, 2008, Proceedings},
  series       = {Communications in Computer and Information Science},
  volume       = {18},
  pages        = {72--83},
  year         = {2008},
  url          = {https://doi.org/10.1007/978-3-540-88708-9\_7},
  doi          = {10.1007/978-3-540-88708-9\_7},
  timestamp    = {Tue, 07 Jan 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/svm2/ValleeNOTEBAS08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/ijhpcn/HeOKSE07,
  author       = {Xubin (Ben) He and
                  Li Ou and
                  Martha J. Kosa and
                  Stephen L. Scott and
                  Christian Engelmann},
  title        = {A unified multiple-level cache for high performance storage systems},
  journal      = {Int. J. High Perform. Comput. Netw.},
  volume       = {5},
  number       = {1/2},
  pages        = {97--109},
  year         = {2007},
  url          = {https://doi.org/10.1504/IJHPCN.2007.015768},
  doi          = {10.1504/IJHPCN.2007.015768},
  timestamp    = {Thu, 09 Jul 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/ijhpcn/HeOKSE07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/IEEEares/EngelmannSLH07,
  author       = {Christian Engelmann and
                  Stephen L. Scott and
                  Chokchai Leangsuksun and
                  Xubin He},
  title        = {On Programming Models for Service-Level High Availability},
  booktitle    = {Proceedings of the Second International Conference on Availability,
                  Reliability and Security, {ARES} 2007, The International Dependability
                  Conference - Bridging Theory and Practice, April 10-13 2007, Vienna,
                  Austria},
  pages        = {999--1008},
  publisher    = {{IEEE} Computer Society},
  year         = {2007},
  url          = {https://doi.org/10.1109/ARES.2007.109},
  doi          = {10.1109/ARES.2007.109},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/IEEEares/EngelmannSLH07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/ccgrid/EngelmannSLH07,
  author       = {Christian Engelmann and
                  Stephen L. Scott and
                  Chokchai Leangsuksun and
                  Xubin He},
  title        = {Transparent Symmetric Active/Active Replication for Service-Level
                  High Availability},
  booktitle    = {Seventh {IEEE} International Symposium on Cluster Computing and the
                  Grid (CCGrid 2007), 14-17 May 2007, Rio de Janeiro, Brazil},
  pages        = {755--760},
  publisher    = {{IEEE} Computer Society},
  year         = {2007},
  url          = {https://doi.org/10.1109/CCGRID.2007.116},
  doi          = {10.1109/CCGRID.2007.116},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/ccgrid/EngelmannSLH07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iccS/EngelmannOS07,
  author       = {Christian Engelmann and
                  Hong Ong and
                  Stephen L. Scott},
  editor       = {Yong Shi and
                  G. Dick van Albada and
                  Jack J. Dongarra and
                  Peter M. A. Sloot},
  title        = {Middleware in Modern High Performance Computing System Architectures},
  booktitle    = {Computational Science - {ICCS} 2007, 7th International Conference,
                  Beijing, China, May 27 - 30, 2007, Proceedings, Part {II}},
  series       = {Lecture Notes in Computer Science},
  volume       = {4488},
  pages        = {784--791},
  publisher    = {Springer},
  year         = {2007},
  url          = {https://doi.org/10.1007/978-3-540-72586-2\_111},
  doi          = {10.1007/978-3-540-72586-2\_111},
  timestamp    = {Tue, 08 Nov 2022 08:34:34 +0100},
  biburl       = {https://dblp.org/rec/conf/iccS/EngelmannOS07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icccn/OuHES07,
  author       = {Li Ou and
                  Xubin He and
                  Christian Engelmann and
                  Stephen L. Scott},
  title        = {A Fast Delivery Protocol for Total Order Broadcasting},
  booktitle    = {Proceedings of the 16th International Conference on Computer Communications
                  and Networks, {IEEE} {ICCCN} 2007, Turtle Bay Resort, Honolulu, Hawaii,
                  USA, August 13-16, 2007},
  pages        = {730--734},
  publisher    = {{IEEE}},
  year         = {2007},
  url          = {https://doi.org/10.1109/ICCCN.2007.4317904},
  doi          = {10.1109/ICCCN.2007.4317904},
  timestamp    = {Wed, 16 Oct 2019 14:14:49 +0200},
  biburl       = {https://dblp.org/rec/conf/icccn/OuHES07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/ics/NagarajanMES07,
  author       = {Arun Babu Nagarajan and
                  Frank Mueller and
                  Christian Engelmann and
                  Stephen L. Scott},
  editor       = {Burton J. Smith},
  title        = {Proactive fault tolerance for {HPC} with {Xen} virtualization},
  booktitle    = {Proceedings of the 21st Annual International Conference on Supercomputing,
                  {ICS} 2007, Seattle, Washington, USA, June 17-21, 2007},
  pages        = {23--32},
  publisher    = {{ACM}},
  year         = {2007},
  url          = {https://doi.org/10.1145/1274971.1274978},
  doi          = {10.1145/1274971.1274978},
  timestamp    = {Tue, 23 Mar 2021 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/ics/NagarajanMES07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/ipps/WangMES07,
  author       = {Chao Wang and
                  Frank Mueller and
                  Christian Engelmann and
                  Stephen L. Scott},
  title        = {A Job Pause Service under {LAM/MPI+BLCR} for Transparent Fault Tolerance},
  booktitle    = {21st International Parallel and Distributed Processing Symposium {(IPDPS}
                  2007), Proceedings, 26-30 March 2007, Long Beach, California, {USA}},
  pages        = {1--10},
  publisher    = {{IEEE}},
  year         = {2007},
  url          = {https://doi.org/10.1109/IPDPS.2007.370307},
  doi          = {10.1109/IPDPS.2007.370307},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/ipps/WangMES07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/pvm/SaverioCBPE07,
  author       = {Emanuele {Di Saverio} and
                  Marco Cesati and
                  Christian {Di Biagio} and
                  Guido Pennella and
                  Christian Engelmann},
  editor       = {Franck Cappello and
                  Thomas H{\'{e}}rault and
                  Jack J. Dongarra},
  title        = {Distributed Real-Time Computing with {Harness}},
  booktitle    = {Recent Advances in Parallel Virtual Machine and Message Passing Interface,
                  14th European {PVM/MPI} User's Group Meeting, Paris, France, September
                  30 - October 3, 2007, Proceedings},
  series       = {Lecture Notes in Computer Science},
  volume       = {4757},
  pages        = {281--288},
  publisher    = {Springer},
  year         = {2007},
  url          = {https://doi.org/10.1007/978-3-540-75416-9\_39},
  doi          = {10.1007/978-3-540-75416-9\_39},
  timestamp    = {Tue, 07 May 2024 20:11:00 +0200},
  biburl       = {https://dblp.org/rec/conf/pvm/SaverioCBPE07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Journal article, J. Comput. 1(8), 2006: symmetric active/active high availability.
% The url query string uses plain (LaTeX-escaped) ampersands as parameter separators.
@article{DBLP:journals/jcp/EngelmannSLH06,
  author       = {Engelmann, Christian and
                  Scott, Stephen L. and
                  Leangsuksun, Chokchai and
                  He, Xubin (Ben)},
  title        = {Symmetric Active/Active High Availability for High-Performance Computing
                  System Services},
  journal      = {J. Comput.},
  volume       = {1},
  number       = {8},
  pages        = {43--54},
  year         = {2006},
  url          = {http://www.jcomputers.us/index.php?m=content\&c=index\&a=show\&catid=95\&id=1222},
  doi          = {10.4304/JCP.1.8.43-54},
  timestamp    = {Thu, 25 Feb 2021 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/jcp/EngelmannSLH06.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Journal article, ACM SIGOPS Oper. Syst. Rev. 40(2), 2006: the MOLAR paper.
@article{DBLP:journals/sigops/EngelmannSBGLVWMSS06,
  author       = {Engelmann, Christian and
                  Scott, Stephen L. and
                  Bernholdt, David E. and
                  Gottumukkala, Narasimha Raju and
                  Leangsuksun, Chokchai and
                  Varma, Jyothish and
                  Wang, Chao and
                  Mueller, Frank and
                  Shet, Aniruddha G. and
                  Sadayappan, P.},
  title        = {{MOLAR:} adaptive runtime support for high-end computing
                  operating and runtime systems},
  journal      = {{ACM} {SIGOPS} Oper. Syst. Rev.},
  volume       = {40},
  number       = {2},
  pages        = {63--72},
  year         = {2006},
  url          = {https://doi.org/10.1145/1131322.1131337},
  doi          = {10.1145/1131322.1131337},
  timestamp    = {Wed, 07 Dec 2022 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/sigops/EngelmannSBGLVWMSS06.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ARES 2006 conference paper: active/active replication for HPC system services.
@inproceedings{DBLP:conf/IEEEares/EngelmannSLH06,
  author       = {Engelmann, Christian and
                  Scott, Stephen L. and
                  Leangsuksun, Chokchai and
                  He, Xubin (Ben)},
  title        = {Active/Active Replication for Highly Available {HPC} System Services},
  booktitle    = {Proceedings of the First International Conference on Availability,
                  Reliability and Security, {ARES} 2006, The International Dependability
                  Conference - Bridging Theory and Practice, April 20-22 2006, Vienna
                  University of Technology, Austria},
  pages        = {639--645},
  publisher    = {{IEEE} Computer Society},
  year         = {2006},
  url          = {https://doi.org/10.1109/ARES.2006.23},
  doi          = {10.1109/ARES.2006.23},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/IEEEares/EngelmannSLH06.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% IEEE Cluster 2006 conference paper: the JOSHUA paper. No page range is recorded here.
@inproceedings{DBLP:conf/cluster/UhlemannES06,
  author       = {Uhlemann, Kai and
                  Engelmann, Christian and
                  Scott, Stephen L.},
  title        = {{JOSHUA:} Symmetric Active/Active Replication for
                  Highly Available {HPC} Job and Resource Management},
  booktitle    = {Proceedings of the 2006 {IEEE} International Conference on
                  Cluster Computing, September 25-28, 2006, Barcelona, Spain},
  publisher    = {{IEEE} Computer Society},
  year         = {2006},
  url          = {https://doi.org/10.1109/CLUSTR.2006.311855},
  doi          = {10.1109/CLUSTR.2006.311855},
  timestamp    = {Thu, 23 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cluster/UhlemannES06.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% HPCC 2006 conference paper (LNCS 4208): a parallel plug-in programming paradigm.
@inproceedings{DBLP:conf/hpcc/BaumannEG06,
  author       = {Baumann, Ronald and
                  Engelmann, Christian and
                  Geist, Al},
  editor       = {Gerndt, Michael and
                  Kranzlm{\"{u}}ller, Dieter},
  title        = {A Parallel Plug-In Programming Paradigm},
  booktitle    = {High Performance Computing and Communications, Second
                  International Conference, {HPCC} 2006, Munich, Germany,
                  September 13-15, 2006, Proceedings},
  series       = {Lecture Notes in Computer Science},
  volume       = {4208},
  pages        = {823--832},
  publisher    = {Springer},
  year         = {2006},
  url          = {https://doi.org/10.1007/11847366\_85},
  doi          = {10.1007/11847366\_85},
  timestamp    = {Tue, 14 May 2019 10:00:45 +0200},
  biburl       = {https://dblp.org/rec/conf/hpcc/BaumannEG06.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICCS 2006 conference paper (LNCS 3992, Part II): the RMIX communication framework.
@inproceedings{DBLP:conf/iccS/EngelmannG06,
  author       = {Engelmann, Christian and
                  Geist, Al},
  editor       = {Alexandrov, Vassil N. and
                  van Albada, G. Dick and
                  Sloot, Peter M. A. and
                  Dongarra, Jack J.},
  title        = {{RMIX:} {A} Dynamic, Heterogeneous, Reconfigurable Communication Framework},
  booktitle    = {Computational Science - {ICCS} 2006, 6th International
                  Conference, Reading, UK, May 28-31, 2006, Proceedings,
                  Part {II}},
  series       = {Lecture Notes in Computer Science},
  volume       = {3992},
  pages        = {573--580},
  publisher    = {Springer},
  year         = {2006},
  url          = {https://doi.org/10.1007/11758525\_77},
  doi          = {10.1007/11758525\_77},
  timestamp    = {Tue, 14 May 2019 10:00:48 +0200},
  biburl       = {https://dblp.org/rec/conf/iccS/EngelmannG06.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICS 2006 conference paper: scalable, fault tolerant membership for MPI tasks.
@inproceedings{DBLP:conf/ics/VarmaWMES06,
  author       = {Varma, Jyothish and
                  Wang, Chao and
                  Mueller, Frank and
                  Engelmann, Christian and
                  Scott, Stephen L.},
  editor       = {Egan, Gregory K. and
                  Muraoka, Yoichi},
  title        = {Scalable, fault tolerant membership for {MPI} tasks on {HPC} systems},
  booktitle    = {Proceedings of the 20th Annual International Conference on
                  Supercomputing, {ICS} 2006, Cairns, Queensland, Australia,
                  June 28 - July 01, 2006},
  pages        = {219--228},
  publisher    = {{ACM}},
  year         = {2006},
  url          = {https://doi.org/10.1145/1183401.1183433},
  doi          = {10.1145/1183401.1183433},
  timestamp    = {Tue, 23 Mar 2021 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/ics/VarmaWMES06.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% IEEE Cluster 2005 conference paper: job-site level fault tolerance.
@inproceedings{DBLP:conf/cluster/LimayeLGSELC05,
  author       = {Limaye, Kshitij and
                  Leangsuksun, Box and
                  Greenwood, Zeno and
                  Scott, Stephen L. and
                  Engelmann, Christian and
                  Libby, Richard and
                  Chanchio, Kasidit},
  title        = {Job-Site Level Fault Tolerance for Cluster and Grid environments},
  booktitle    = {2005 {IEEE} International Conference on Cluster Computing
                  {(CLUSTER} 2005), September 26 - 30, 2005, Boston,
                  Massachusetts, {USA}},
  pages        = {1--9},
  publisher    = {{IEEE} Computer Society},
  year         = {2005},
  url          = {https://doi.org/10.1109/CLUSTR.2005.347043},
  doi          = {10.1109/CLUSTR.2005.347043},
  timestamp    = {Thu, 23 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cluster/LimayeLGSELC05.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICCS 2005 conference paper (LNCS 3514, Part I): super-scalable algorithms.
@inproceedings{DBLP:conf/iccS/EngelmannG05,
  author       = {Engelmann, Christian and
                  Geist, Al},
  editor       = {Sunderam, Vaidy S. and
                  van Albada, G. Dick and
                  Sloot, Peter M. A. and
                  Dongarra, Jack J.},
  title        = {Super-Scalable Algorithms for Computing on 100,000 Processors},
  booktitle    = {Computational Science - {ICCS} 2005, 5th International Conference,
                  Atlanta, GA, USA, May 22-25, 2005, Proceedings, Part {I}},
  series       = {Lecture Notes in Computer Science},
  volume       = {3514},
  pages        = {313--321},
  publisher    = {Springer},
  year         = {2005},
  url          = {https://doi.org/10.1007/11428831\_39},
  doi          = {10.1007/11428831\_39},
  timestamp    = {Tue, 14 May 2019 10:00:48 +0200},
  biburl       = {https://dblp.org/rec/conf/iccS/EngelmannG05.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% IPDPS 2005 conference paper: a lightweight kernel for Harness. No page range is recorded here.
@inproceedings{DBLP:conf/ipps/EngelmannG05,
  author       = {Engelmann, Christian and
                  Geist, Al},
  title        = {A Lightweight Kernel for the Harness Metacomputing Framework},
  booktitle    = {19th International Parallel and Distributed Processing
                  Symposium {(IPDPS} 2005), {CD-ROM} / Abstracts Proceedings,
                  4-8 April 2005, Denver, CO, {USA}},
  publisher    = {{IEEE} Computer Society},
  year         = {2005},
  url          = {https://doi.org/10.1109/IPDPS.2005.34},
  doi          = {10.1109/IPDPS.2005.34},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/ipps/EngelmannG05.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% SERP 2005 conference paper (Volume 1): UML-based availability modeling. No url or doi is recorded here.
@inproceedings{DBLP:conf/serp/SongLNLES05,
  author       = {Song, Hertong and
                  Leangsuksun, Chokchai and
                  Nassar, Raja and
                  Liu, Yudan and
                  Engelmann, Christian and
                  Scott, Stephen L.},
  editor       = {Arabnia, Hamid R. and
                  Reza, Hassan},
  title        = {UML-based Beowulf Cluster Availability Modeling},
  booktitle    = {Proceedings of the International Conference on Software
                  Engineering Research and Practice, {SERP} 2005, Las Vegas,
                  Nevada, USA, June 27-29, 2005, Volume 1},
  pages        = {161--167},
  publisher    = {{CSREA} Press},
  year         = {2005},
  timestamp    = {Tue, 31 Jan 2006 11:57:16 +0100},
  biburl       = {https://dblp.org/rec/conf/serp/SongLNLES05.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% CLADE 2003 workshop paper: diskless checkpointing applied to the Fast Fourier Transform.
@inproceedings{DBLP:conf/clade/EngelmannG03,
  author       = {Engelmann, Christian and
                  Geist, Al},
  title        = {A Diskless Checkpointing Algorithm for Super-scale
                  Architectures Applied to the Fast Fourier Transform},
  booktitle    = {1st International Workshop on Challenges of Large
                  Applications in Distributed Environments, CLADE@HPDC 2003,
                  Seattle, WA, USA, June 21, 2003},
  pages        = {47},
  publisher    = {{IEEE} Computer Society},
  year         = {2003},
  url          = {https://doi.org/10.1109/CLADE.2003.1209999},
  doi          = {10.1109/CLADE.2003.1209999},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/clade/EngelmannG03.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% ICCS 2002 conference paper (LNCS 2330, Part II): peer-to-peer control in Harness.
% The third author uses the "Last, Jr, First" form so that "II" is parsed as a
% name suffix; written as "George Al Geist II", BibTeX takes "II" as the surname.
@inproceedings{DBLP:conf/iccS/EngelmannSG02,
  author       = {Engelmann, Christian and
                  Scott, Stephen L. and
                  Geist, II, George Al},
  editor       = {Sloot, Peter M. A. and
                  Tan, Chih Jeng Kenneth and
                  Dongarra, Jack J. and
                  Hoekstra, Alfons G.},
  title        = {Distributed Peer-to-Peer Control in Harness},
  booktitle    = {Computational Science - {ICCS} 2002, International Conference, Amsterdam,
                  The Netherlands, April 21-24, 2002. Proceedings, Part {II}},
  series       = {Lecture Notes in Computer Science},
  volume       = {2330},
  pages        = {720--728},
  publisher    = {Springer},
  year         = {2002},
  url          = {https://doi.org/10.1007/3-540-46080-2\_76},
  doi          = {10.1007/3-540-46080-2\_76},
  timestamp    = {Tue, 14 May 2019 10:00:48 +0200},
  biburl       = {https://dblp.org/rec/conf/iccS/EngelmannSG02.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
a service of Schloss Dagstuhl - Leibniz Center for Informatics