BibTeX Citations

@article{snir14addressing,
  author        = "Marc Snir
                   and Robert W. Wisniewski
                   and Jacob A. Abraham
                   and Sarita V. Adve
                   and Saurabh Bagchi
                   and Pavan Balaji
                   and Jim Belak
                   and Pradip Bose
                   and Franck Cappello
                   and Bill Carlson
                   and Andrew A. Chien
                   and Paul Coteus
                   and Nathan A. Debardeleben
                   and Pedro Diniz
                   and Christian Engelmann
                   and Mattan Erez
                   and Saverio Fazzari
                   and Al Geist
                   and Rinku Gupta
                   and Fred Johnson
                   and Sriram Krishnamoorthy
                   and Sven Leyffer
                   and Dean Liberty
                   and Subhasish Mitra
                   and Todd Munson
                   and Rob Schreiber
                   and Jon Stearley
                   and Eric Van Hensbergen",
  title         = "Addressing Failures in Exascale Computing",
  journal       = "\href{http://hpc.sagepub.com}{International Journal of High Performance Computing Applications (IJHPCA)}",
  volume        = "28",
  number        = "2",
  pages         = "127--171",
  month         = may,
  year          = "2014",
  publisher     = "\href{http://www.sagepub.com}{SAGE Publications}",
  issn          = "1094-3420",
  doi           = "http://dx.doi.org/10.1177/1094342014522573",
  url           = "http://www.christian-engelmann.info/publications/snir14addressing.pdf",
  abstract      = "We present here a report produced by a workshop on 
                    `Addressing failures in exascale computing' held in Park City,
                   Utah, 4-11 August 2012. The charter of this workshop was to 
                   establish a common taxonomy about resilience across all the 
                   levels in a computing system, discuss existing knowledge on 
                   resilience across the various hardware and software layers 
                   of an exascale system, and build on those results, examining 
                   potential solutions from both a hardware and software 
                   perspective and focusing on a combined approach.
                   The workshop brought together participants with expertise in 
                   applications, system software, and hardware; they came from 
                   industry, government, and academia, and their interests ranged 
                   from theory to implementation. The combination allowed broad 
                   and comprehensive discussions and led to this document, which 
                   summarizes and builds on those discussions."
}
@article{engelmann13scaling,
  author        = "Christian Engelmann",
  title         = "Scaling To A Million Cores And Beyond: {Using} Light-Weight
                   Simulation to Understand The Challenges Ahead On The Road To
                   Exascale",
  journal       = "\href{http://www.elsevier.com/locate/fgcs}{Future Generation
                   Computer Systems (FGCS)}",
  volume        = "30",
  number        = "0",
  pages         = "59--65",
  month         = jan,
  year          = "2014",
  publisher     = "\href{http://www.elsevier.com}{Elsevier B.V, Amsterdam, The
                   Netherlands}",
  issn          = "0167-739X",
  doi           = "http://dx.doi.org/10.1016/j.future.2013.04.014",
  url           = "http://www.christian-engelmann.info/publications/engelmann13scaling.pdf",
  abstract      = "As supercomputers scale to 1,000 PFlop/s over the next
                   decade, investigating the performance of parallel
                   applications at scale on future architectures and the
                   performance impact of different architecture choices for
                   high-performance computing (HPC) hardware/software co-design
                   is crucial. This paper summarizes recent efforts in designing
                   and implementing a novel HPC hardware/software co-design
                   toolkit. The presented Extreme-scale Simulator (xSim) permits
                   running an HPC application in a controlled environment with
                   millions of concurrent execution threads while observing its
                   performance in a simulated extreme-scale HPC system using
                   architectural models and virtual timing. This paper
                   demonstrates the capabilities and usefulness of the xSim
                   performance investigation toolkit, such as its scalability
                   to $2^{27}$ simulated Message Passing Interface (MPI) ranks
                   on 960 real processor cores, the capability to evaluate the
                   performance of different MPI collective communication
                   algorithms, and the ability to evaluate the performance of
                   a basic Monte Carlo application with different architectural
                   parameters."
}
@article{wang12proactive,
  author        = "Chao Wang
                   and Frank Mueller
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Proactive Process-Level Live Migration and Back Migration in
                   {HPC} Environments",
  journal       = "\href{http://www.elsevier.com/locate/jpdc}{Journal of
                   Parallel and Distributed Computing (JPDC)}",
  volume        = "72",
  number        = "2",
  pages         = "254--267",
  month         = feb,
  year          = "2012",
  publisher     = "\href{http://www.elsevier.com}{Elsevier B.V, Amsterdam, The
                   Netherlands}",
  issn          = "0743-7315",
  doi           = "http://dx.doi.org/10.1016/j.jpdc.2011.10.009",
  url           = "http://www.christian-engelmann.info/publications/wang12proactive.pdf",
  abstract      = "As the number of nodes in high-performance computing
                    environments keeps increasing, faults are becoming
                    commonplace. Reactive fault tolerance (FT) often does not scale
                   due to massive I/O requirements and relies on manual job
                   resubmission.
                   This work complements reactive with proactive FT at the
                   process level. Through health monitoring, a subset of node
                    failures can be anticipated when a node's health deteriorates.
                   A novel process-level live migration mechanism supports
                   continued execution of applications during much of process
                   migration. This scheme is integrated into an MPI execution
                   environment to transparently sustain health-inflicted node
                   failures, which eradicates the need to restart and requeue
                   MPI jobs. Experiments indicate that 1-6.5 s of prior warning
                   are required to successfully trigger live process migration
                   while similar operating system virtualization mechanisms
                   require 13-24 s. This self-healing approach complements
                   reactive FT by nearly cutting the number of checkpoints in
                   half when 70\% of the faults are handled proactively. The
                   work also provides a novel back migration approach to
                   eliminate load imbalance or bottlenecks caused by migrated
                   tasks. Experiments indicate the larger the amount of
                   outstanding execution, the higher the benefit due to back
                   migration.",
}
@article{scott10system,
  author        = "Stephen L. Scott
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Anand Tikotekar
                   and Christian Engelmann
                   and Hong H. Ong",
  title         = "System-Level Virtualization Research at {Oak Ridge National
                   Laboratory}",
  journal       = "\href{http://www.elsevier.com/locate/fgcs}{Future Generation
                   Computer Systems (FGCS)}",
  volume        = "26",
  number        = "3",
  pages         = "304--307",
  month         = mar,
  year          = "2010",
  publisher     = "\href{http://www.elsevier.com}{Elsevier B.V, Amsterdam, The
                   Netherlands}",
  issn          = "0167-739X",
  doi           = "http://dx.doi.org/10.1016/j.future.2009.07.001",
  url           = "http://www.christian-engelmann.info/publications/scott09system.pdf",
  abstract      = "System-level virtualization is today enjoying a rebirth as a
                   technique to effectively share what were then considered
                   large computing resources to subsequently fade from the
                   spotlight as individual workstations gained in popularity
                   with a one machine -- one user approach. One reason for
                   this resurgence is that the simple workstation has grown in
                   capability to rival that of anything available in the past.
                   Thus, computing centers are again looking at the
                   price/performance benefit of sharing that single computing
                   box via server consolidation. However, industry is only
                   concentrating on the benefits of using virtualization for
                   server consolidation (enterprise computing) whereas our
                   interest is in leveraging virtualization to advance
                   high-performance computing (HPC). While these two interests
                    may appear to be orthogonal -- one consolidating multiple
                    applications and users on a single machine while the other
                    requiring all the power from many machines to be dedicated
                    solely to its purpose -- we propose that virtualization does
                    provide attractive capabilities that may be exploited to the
                    benefit of HPC interests. This raises two fundamental
                    questions: is the concept of virtualization (a machine
                    sharing technology) really suitable for HPC, and if so,
                    how does one go about leveraging these virtualization
                    capabilities for the benefit of HPC? To address these
                   questions, this document presents ongoing studies on the
                   usage of system-level virtualization in a HPC context. These
                   studies include an analysis of the benefits of system-level
                   virtualization for HPC, a presentation of research efforts
                   based on virtualization for system availability, and a
                   presentation of research efforts for the management of
                   virtual systems. The basis for this document was material
                   presented by Stephen L. Scott at the Collaborative and Grid
                   Computing Technologies meeting held in Cancun, Mexico on
                   April 12-14, 2007."
}
@article{he09symmetric,
  author        = "Xubin (Ben) He
                   and Li Ou
                   and Christian Engelmann
                   and Xin Chen
                   and Stephen L. Scott",
  title         = "Symmetric Active/Active Metadata Service for High
                   Availability Parallel File Systems",
  journal       = "\href{http://www.elsevier.com/locate/jpdc}{Journal of
                   Parallel and Distributed Computing (JPDC)}",
  volume        = "69",
  number        = "12",
  pages         = "961--973",
  month         = dec,
  year          = "2009",
  publisher     = "\href{http://www.elsevier.com}{Elsevier B.V, Amsterdam, The
                   Netherlands}",
  issn          = "0743-7315",
  doi           = "http://dx.doi.org/10.1016/j.jpdc.2009.08.004",
  url           = "http://www.christian-engelmann.info/publications/he09symmetric.pdf",
  abstract      = "High availability data storage systems are critical for many
                   applications as research and business become more
                   data-driven. Since metadata management is essential to
                   system availability, multiple metadata services are used to
                   improve the availability of distributed storage systems.
                   Past research focused on the active/standby model, where
                   each active service has at least one redundant idle backup.
                   However, interruption of service and even some loss of
                    service state may occur during a fail-over, depending on the
                    replication technique used. In addition, the replication
                   overhead for multiple metadata services can be very high.
                   The research in this paper targets the symmetric
                   active/active replication model, which uses multiple
                   redundant service nodes running in virtual synchrony. In
                   this model, service node failures do not cause a fail-over
                   to a backup and there is no disruption of service or loss
                   of service state. We further discuss a fast delivery
                   protocol to reduce the latency of the needed total order
                   broadcast. Our prototype implementation shows that
                   metadata service high availability can be achieved with
                   an acceptable performance trade-off using our symmetric
                   active/active metadata service solution."
}
@article{he07unified,
  author        = "Xubin (Ben) He
                   and Li Ou
                   and Martha J. Kosa
                   and Stephen L. Scott
                   and Christian Engelmann",
  title         = "A Unified Multiple-Level Cache for High Performance Cluster
                   Storage Systems",
  journal       = "\href{http://www.inderscience.com/browse/index.php?journalcode=ijhpcn}
                   {International Journal of High Performance Computing and
                   Networking (IJHPCN)}",
  volume        = "5",
  number        = "1-2",
  pages         = "97--109",
  year          = "2007",
  publisher     = "\href{http://www.inderscience.com}{Inderscience Publishers,
                   Geneve, Switzerland}",
  issn          = "1740-0562",
  doi           = "http://dx.doi.org/10.1504/IJHPCN.2007.015768",
  url           = "http://www.christian-engelmann.info/publications/he07unified.pdf",
  abstract      = "Highly available data storage for high-performance computing
                   is becoming increasingly more critical as high-end computing
                   systems scale up in size and storage systems are developed
                   around network-centered architectures. A promising solution
                   is to harness the collective storage potential of individual
                    workstations, much as we harness idle CPU cycles, due to the
                   excellent price/performance ratio and low storage usage of
                   most commodity workstations. For such a storage system,
                   metadata consistency is a key issue assuring storage system
                   availability as well as data reliability. In this paper, we
                   present a decentralized metadata management scheme that
                   improves storage availability without sacrificing
                   performance."
}
@article{engelmann06symmetric,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Chokchai (Box) Leangsuksun
                   and Xubin (Ben) He",
  title         = "Symmetric Active/Active High Availability for
                   High-Performance Computing System Services",
  journal       = "\href{http://www.academypublisher.com/jcp}{Journal of
                   Computers (JCP)}",
  volume        = "1",
  number        = "8",
  pages         = "43--54",
  month         = dec,
  year          = "2006",
  publisher     = "\href{http://www.academypublisher.com}{Academy Publisher,
                   Oulu, Finland}",
  issn          = "1796-203X",
  doi           = "http://www.academypublisher.com/jcp/vol01/no08/jcp01084354.html",
  url           = "http://www.christian-engelmann.info/publications/engelmann06symmetric.pdf",
  abstract      = "This work aims to pave the way for high availability in
                   high-performance computing (HPC) by focusing on efficient
                   redundancy strategies for head and service nodes. These nodes
                   represent single points of failure and control for an entire
                   HPC system as they render it inaccessible and unmanageable in
                   case of a failure until repair. The presented approach
                   introduces two distinct replication methods, internal and
                   external, for providing symmetric active/active high
                   availability for multiple redundant head and service nodes
                   running in virtual synchrony utilizing an existing process
                   group communication system for service group membership
                   management and reliable, totally ordered message delivery.
                    Presented results of a prototype implementation that offers
                   symmetric active/active replication for HPC job and resource
                   management using external replication show that the highest
                   level of availability can be provided with an acceptable
                   performance trade-off."
}
@article{engelmann06molar,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and David E. Bernholdt
                   and Narasimha R. Gottumukkala
                   and Chokchai (Box) Leangsuksun
                   and Jyothish Varma
                   and Chao Wang
                   and Frank Mueller
                   and Aniruddha G. Shet
                   and Ponnuswamy (Saday) Sadayappan",
  title         = "{MOLAR}: {A}daptive Runtime Support for High-End Computing
                   Operating and Runtime Systems",
  journal       = "\href{http://www.sigops.org/osr.html}{ACM SIGOPS Operating
                   Systems Review (OSR)}",
  volume        = "40",
  number        = "2",
  pages         = "63--72",
  month         = apr,
  year          = "2006",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  issn          = "0163-5980",
  doi           = "http://doi.acm.org/10.1145/1131322.1131337",
  url           = "http://www.christian-engelmann.info/publications/engelmann06molar.pdf",
  abstract      = "MOLAR is a multi-institutional research effort that
                   concentrates on adaptive, reliable, and efficient operating
                   and runtime system (OS/R) solutions for ultra-scale,
                   high-end scientific computing on the next generation of
                   supercomputers. This research addresses the challenges
                   outlined in FAST-OS (forum to address scalable technology for
                   runtime and operating systems) and HECRTF (high-end computing
                   revitalization task force) activities by exploring the use of
                   advanced monitoring and adaptation to improve application
                   performance and predictability of system interruptions, and
                   by advancing computer reliability, availability and
                   serviceability (RAS) management systems to work cooperatively
                   with the OS/R to identify and preemptively resolve system
                   issues. This paper describes recent research of the MOLAR
                   team in advancing RAS for high-end computing OS/Rs."
}
@conference{engelmann14improving,
  author        = "Christian Engelmann
                   and Thomas Naughton",
  title         = "Improving the Performance of the Extreme-scale Simulator",
  booktitle     = "Proceedings of the \href{http://ds-rt.com/2014}{$18^{th}$
                   IEEE/ACM International Symposium on Distributed Simulation
                   and Real Time Applications (DS-RT) 2014}",
  pages         = "198--207",
  month         = oct # "~1-3, ",
  year          = "2014",
  address       = "Toulouse, France",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  issn          = "1550-6525",
  isbn          = "978-1-4799-6143-6",
  doi           = "http://dx.doi.org/10.1109/DS-RT.2014.32",
  url           = "http://www.christian-engelmann.info/publications/engelmann14improving.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann14improving.ppt.pdf",
  abstract      = "Investigating the performance of parallel applications at
                   scale on future high-performance computing (HPC) architectures
                   and the performance impact of different architecture choices
                   is an important component of HPC hardware/software co-design.
                   The Extreme-scale Simulator (xSim) is a simulation-based toolkit
                   for investigating the performance of parallel applications at
                   scale. xSim scales to millions of simulated Message Passing
                   Interface (MPI) processes. The overhead introduced by a
                   simulation tool is an important performance and productivity
                   aspect. This paper documents two improvements to xSim: (1) a
                   new deadlock resolution protocol to reduce the parallel
                   discrete event simulation management overhead and (2) a new
                   simulated MPI message matching algorithm to reduce the
                   oversubscription management overhead. The results clearly
                    show a significant performance improvement, for example, reducing
                   the simulation overhead for running the NAS Parallel Benchmark
                   suite inside the simulator from 1,020\% to 238\% for the
                   conjugate gradient (CG) benchmark and from 102\% to 0\% for
                    the embarrassingly parallel (EP) benchmark, as well as
                   from 37,511\% to 13,808\% for CG and from 3,332\% to 204\% for
                   EP with accurate process failure simulation."
}
@conference{naughton14supporting,
  author        = "Thomas Naughton
                   and Christian Engelmann
                   and Geoffroy Vall{\'e}e
                   and Swen B{\"o}hm",
  title         = "Supporting the Development of Resilient Message Passing
                   Applications using Simulation",
  booktitle     = "Proceedings of the \href{http://www.pdp2014.org}{$22^{nd}$
                   Euromicro International Conference on Parallel, Distributed,
                   and network-based Processing (PDP) 2014}",
  pages         = "271--278",
  month         = feb # "~12-14, ",
  year          = "2014",
  address       = "Turin, Italy",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  issn          = "1066-6192",
  doi           = "http://dx.doi.org/10.1109/PDP.2014.74",
  url           = "http://www.christian-engelmann.info/publications/naughton14supporting.pdf",
  url2          = "http://www.christian-engelmann.info/publications/naughton14supporting.ppt.pdf",
  abstract      = "An emerging aspect of high-performance computing (HPC)
                   hardware/software co-design is investigating performance
                   under failure. The work in this paper extends the
                   Extreme-scale Simulator (xSim), which was designed for
                   evaluating the performance of message passing interface
                   (MPI) applications on future HPC architectures, with
                   fault-tolerant MPI extensions proposed by the MPI Fault
                   Tolerance Working Group. xSim permits running MPI
                   applications with millions of concurrent MPI ranks, while
                   observing application performance in a simulated
                   extreme-scale system using a lightweight parallel discrete
                   event simulation. The newly added features offer user-level
                   failure mitigation (ULFM) extensions at the simulated MPI
                   layer to support algorithm-based fault tolerance (ABFT).
                   The presented solution permits investigating performance
                   under failure and failure handling of ABFT solutions.
                   The newly enhanced xSim is the very first performance tool
                   that supports ULFM and ABFT.",
  note          = "Acceptance rate 32.6\% (73/224)"
}
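For background, the ULFM extensions referenced above center on a small set of
MPIX_ calls proposed by the MPI Fault Tolerance Working Group. A minimal sketch
of the recovery pattern in C (illustrative only, not xSim code; assumes a
ULFM-enabled MPI providing mpi-ext.h and MPI_ERRORS_RETURN set on the
communicator):

  /* Sketch: shrink-based recovery after a rank failure (ULFM). */
  #include <mpi.h>
  #include <mpi-ext.h>  /* MPIX_ prototypes in ULFM-enabled MPIs */

  static void recover(MPI_Comm *comm)
  {
      MPI_Comm survivors;
      MPIX_Comm_revoke(*comm);             /* interrupt pending operations */
      MPIX_Comm_shrink(*comm, &survivors); /* communicator of live ranks   */
      MPI_Comm_free(comm);
      *comm = survivors;                   /* ABFT code would rebuild state */
  }

  int bcast_with_recovery(void *buf, int n, MPI_Comm *comm)
  {
      int rc = MPI_Bcast(buf, n, MPI_INT, 0, *comm);
      if (rc == MPIX_ERR_PROC_FAILED || rc == MPIX_ERR_REVOKED)
          recover(comm);                   /* caller retries on survivors  */
      return rc;
  }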
@conference{vallee13runtime,
  author        = "Geoffroy Vall{\'e}e
                   and Thomas Naughton
                   and Swen B{\"o}hm
                   and Christian Engelmann",
  title         = "A Runtime Environment for Supporting Research in Resilient
                    {HPC} System Software \& Tools",
  booktitle     = "Proceedings of the \href{http://is-candar.org}
                   {$1^{st}$ International Symposium on Computing and
                   Networking - Across Practical Development and Theoretical
                   Research - (CANDAR) 2013}",
  pages         = "213--219",
  month         = dec # "~4-6, ",
  year          = "2013",
  address       = "Matsuyama, Japan",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-1-4799-2795-1",
  doi           = "http://dx.doi.org/10.1109/CANDAR.2013.38",
  url           = "http://www.christian-engelmann.info/publications/vallee13runtime.pdf",
  url2          = "http://www.christian-engelmann.info/publications/vallee13runtime.ppt.pdf",
  abstract      = "The high-performance computing~(HPC) community continues to
                   increase the size and complexity of hardware platforms that
                   support advanced scientific workloads. The runtime
                   environment (RTE) is a crucial layer in the software
                   stack for these large-scale systems. The RTE manages the
                   interface between the operating system and the application
                   running in parallel on the machine. The deployment of
                    applications and tools on large-scale HPC systems
                   requires the RTE to manage process creation in a scalable
                   manner, support sparse connectivity, and provide fault
                   tolerance. We have developed a new RTE that provides a basis
                   for building distributed execution environments and
                   developing tools for HPC to aid research in system software
                   and resilience. This paper describes the software
                   architecture of the Scalable runTime Component
                   Infrastructure~(STCI), which is intended to provide a
                   complete infrastructure for scalable start-up and
                   management of many processes in large-scale HPC systems. We
                   highlight features of the current implementation, which is
                   provided as a system library that allows developers to easily
                   use and integrate STCI in their tools and/or applications.
                   The motivation for this work has been to support ongoing
                   research activities in fault-tolerance for large-scale
                   systems. We discuss the advantages of the modular framework
                   employed and describe two use cases that demonstrate its
                   capabilities: (i) an alternate runtime for a Message Passing
                   Interface (MPI) stack, and (ii) a distributed control and
                   communication substrate for a fault-injection tool.",
  note          = "Acceptance rate 35.8\% (28/78)"
}
@conference{engelmann13investigating,
  author        = "Christian Engelmann",
  title         = "Investigating Operating System Noise in Extreme-Scale
                   High-Performance Computing Systems using Simulation",
  booktitle     = "Proceedings of the
                   \href{http://www.iasted.org/conferences/home-795.html}
                   {$11^{th}$ IASTED International Conference on Parallel and
                   Distributed Computing and Networks (PDCN) 2013}",
  month         = feb # "~11-13, ",
  year          = "2013",
  address       = "Innsbruck, Austria",
  publisher     = "\href{http://www.actapress.com}{ACTA Press, Calgary, AB,
                   Canada}",
  isbn          = "978-0-88986-943-1",
  url           = "http://www.christian-engelmann.info/publications/engelmann12investigating.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann12investigating.ppt.pdf",
  abstract      = "Hardware/software co-design for future-generation
                   high-performance computing (HPC) systems aims at closing
                   the gap between the peak capabilities of the hardware
                   and the performance realized by applications
                   (application-architecture performance gap). Performance
                   profiling of architectures and applications is a crucial
                   part of this iterative process. The work in this paper
                   focuses on operating system (OS) noise as an additional
                   factor to be considered for co-design. It represents the
                   first step in including OS noise in HPC hardware/software
                   co-design by adding a noise injection feature to an existing
                   simulation-based co-design toolkit. It reuses an existing
                   abstraction for OS noise with frequency (periodic recurrence)
                   and period (duration of each occurrence) to enhance the
                   processor model of the Extreme-scale Simulator (xSim) with
                   synchronized and random OS noise simulation. The results
                   demonstrate this capability by evaluating the impact of OS
                   noise on MPI\_Bcast() and MPI\_Reduce() in a simulated
                   future-generation HPC system with 2,097,152 compute nodes."
}
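Using the noise abstraction above, a hedged back-of-the-envelope reading: with
noise events of duration $d$ (the period) recurring $f$ times per unit time
(the frequency), the fraction of total time $T$ lost to noise on a single
core is

  \begin{equation*}
    \frac{T_{\mathrm{noise}}}{T} = f \cdot d ,
  \end{equation*}

while the impact on collectives such as MPI\_Bcast() can be far larger at
scale, since unsynchronized noise events delay different nodes at different
times.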
@conference{fiala12detection2,
  author        = "David Fiala
                   and Frank Mueller
                   and Christian Engelmann
                   and Kurt Ferreira
                   and Ron Brightwell
                   and Rolf Riesen",
  title         = "Detection and Correction of Silent Data Corruption for
                   Large-Scale High-Performance Computing",
  booktitle     = "Proceedings of the
                   \href{http://sc12.supercomputing.org}{$25^{th}$ IEEE/ACM
                   International Conference on High Performance Computing,
                   Networking, Storage and Analysis (SC) 2012}",
  pages         = "78:1--78:12",
  month         = nov # "~10-16, ",
  year          = "2012",
  address       = "Salt Lake City, UT, USA",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  isbn          = "978-1-4673-0804-5",
  url           = "http://www.christian-engelmann.info/publications/fiala12detection2.pdf",
  url2          = "http://www.christian-engelmann.info/publications/fiala12detection2.ppt.pdf",
  abstract      = "Faults have become the norm rather than the exception for
                   high-end computing on clusters with 10s/100s of thousands of
                   cores. Exacerbating this situation, some of these faults
                   remain undetected, manifesting themselves as silent errors
                   that corrupt memory while applications continue to operate
                   and report incorrect results.
                   This paper studies the potential for redundancy to both
                   detect and correct soft errors in MPI message-passing
                   applications. Our study investigates the challenges inherent
                    to detecting soft errors within MPI applications while
                   providing transparent MPI redundancy. By assuming a model
                   wherein corruption in application data manifests itself by
                   producing differing MPI message data between replicas, we
                   study the best suited protocols for detecting and correcting
                   MPI data that is the result of corruption.
                   To experimentally validate our proposed detection and
                   correction protocols, we introduce RedMPI, an MPI library
                   which resides in the MPI profiling layer. RedMPI is capable
                   of both online detection and correction of soft errors that
                   occur in MPI applications without requiring any
                   modifications to the application source by utilizing either
                   double or triple redundancy.
                   Our results indicate that our most efficient consistency
                   protocol can successfully protect applications experiencing
                   even high rates of silent data corruption with runtime
                   overheads between 0\% and 30\% as compared to unprotected
                   applications without redundancy.
                   Using our fault injector within RedMPI, we observe that even
                   a single soft error can have profound effects on running
                    applications, causing a cascading pattern of corruption
                    that, in most cases, spreads to all other processes.
                   RedMPI's protection has been shown to successfully mitigate
                   the effects of soft errors while allowing applications to
                   complete with correct results even in the face of errors.",
  note          = "Acceptance rate 21.2\% (100/472)"
}
@conference{elliott12combining,
  author        = "James Elliott
                   and Kishor Kharbas
                   and David Fiala
                   and Frank Mueller
                   and Kurt Ferreira
                   and Christian Engelmann",
  title         = "Combining Partial Redundancy and Checkpointing for {HPC}",
  booktitle     = "Proceedings of the \href{http://icdcs-2012.org/}
                   {$32^{nd}$ International Conference on Distributed
                   Computing Systems (ICDCS) 2012}",
  pages         = "615--626",
  month         = jun # "~18-21, ",
  year          = "2012",
  address       = "Macau, SAR, China",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-4685-8",
  issn          = "1063-6927",
  doi           = "http://dx.doi.org/10.1109/ICDCS.2012.56",
  url           = "http://www.christian-engelmann.info/publications/elliott12combining.pdf",
  url2          = "http://www.christian-engelmann.info/publications/elliott12combining.ppt.pdf",
  abstract      = "Today's largest High Performance Computing (HPC) systems
                    exceed one PFlop/s ($10^{15}$ floating point operations per
                   second) and exascale systems are projected within seven
                   years. But reliability is becoming one of the major
                   challenges faced by exascale computing. With billion-core
                   parallelism, the mean time to failure is projected to be in
                   the range of minutes or hours instead of days. Failures are
                   becoming the norm rather than the exception during execution
                   of HPC applications. Current fault tolerance techniques in
                   HPC focus on reactive ways to mitigate faults, namely via
                   checkpoint and restart (C/R). Apart from storage overheads,
                   C/R-based fault recovery comes at an additional cost in
                   terms of application performance because normal execution
                   is disrupted when checkpoints are taken. Studies have shown
                   that applications running at a large scale spend more than
                   50\% of their total time saving checkpoints, restarting and
                   redoing lost work. Redundancy is another fault tolerance
                   technique, which employs redundant processes performing the
                   same task. If a process fails, a replica of it can take over
                   its execution. Thus, redundant copies can decrease the
                   overall failure rate. The downside of redundancy is that
                   extra resources are required and there is an additional
                   overhead on communication and synchronization. This work
                   contributes a model and analyzes the benefit of C/R in
                   coordination with redundancy at different degrees to
                    minimize the total wallclock time and resource utilization
                   of HPC applications. We further conduct experiments with an
                   implementation of redundancy within the MPI layer on a
                   cluster. Our experimental results confirm the benefit of dual
                   and triple redundancy - but not for partial redundancy - and
                   show a close fit to the model. At 80,000 processes, dual
                   redundancy requires twice the number of processing resources
                   for an application but allows two jobs of 128 hours wallclock
                   time to finish within the time of just one job without
                   redundancy. For narrow ranges of processor counts, partial
                   redundancy results in the lowest time. Once the count exceeds
                    770,000, triple redundancy has the lowest overall cost.
                   Thus, redundancy allows one to trade-off additional resource
                   requirements against wallclock time, which provides a tuning
                   knob for users to adapt to resource availabilities.",
  note          = "Acceptance rate 13.8\% (71/515)"
}
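For context on the model space (not the paper's exact model), the classic
first-order result for checkpoint/restart is Young's approximation for the
optimal checkpoint interval $\tau_{opt}$, given a checkpoint cost $\delta$ and
a system mean time between failures $M$:

  \begin{equation*}
    \tau_{opt} \approx \sqrt{2 \, \delta \, M} .
  \end{equation*}

Redundancy of degree $m$ raises the effective $M$ seen by the job, which
lengthens $\tau_{opt}$ and reduces the number of checkpoints taken; this is
the trade-off the paper quantifies against the $m$-fold resource cost.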
@conference{wang12nvmalloc,
  author        = "Chao Wang
                   and Sudharshan S. Vazhkudai
                   and Xiaosong Ma
                   and Fei Meng
                   and Youngjae Kim
                   and Christian Engelmann",
  title         = "{NVMalloc}: Exposing an Aggregate {SSD} Store as a Memory
                   Partition in Extreme-Scale Machines",
  booktitle     = "Proceedings of the \href{http://www.ipdps.org}
                   {$26^{th}$ IEEE International Parallel and Distributed
                   Processing Symposium (IPDPS) 2012}",
  pages         = "957--968",
  month         = may # "~21-25, ",
  year          = "2012",
  address       = "Shanghai, China",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-4675-9",
  doi           = "http://dx.doi.org/10.1109/IPDPS.2012.90",
  url           = "http://www.christian-engelmann.info/publications/wang12nvmalloc.pdf",
  url2          = "http://www.christian-engelmann.info/publications/wang12nvmalloc.ppt.pdf",
  abstract      = "DRAM is a precious resource in extreme-scale machines and is
                   increasingly becoming scarce, mainly due to the growing
                   number of cores per node. On future multi-petaflop and
                   exaflop machines, the memory pressure is likely to be so
                   severe that we need to rethink our memory usage models.
                   Fortunately, the advent of non-volatile memory (NVM) offers
                   a unique opportunity in this space. Current NVM offerings
                   possess several desirable properties, such as low cost and
                   power efficiency, but also suffer from high latency and
                   lifetime issues. We need rich techniques to be able to use
                   them alongside DRAM. In this paper, we propose a novel
                   approach to exploiting NVM as a secondary memory partition
                   so that applications can explicitly allocate and manipulate
                   memory regions therein. More specifically, we propose an
                   NVMalloc library with a suite of services that enables
                   applications to access a distributed NVM storage system.
                   We have devised ways within NVMalloc so that the storage
                   system, built from compute node-local NVM devices, can be
                   accessed in a byte-addressable fashion using the memory
                   mapped I/O interface. Our approach has the potential to
                   re-energize out-of-core computations on large-scale machines
                   by having applications allocate certain variables through
                   NVMalloc, thereby increasing the overall memory available
                   for the application. Our evaluation on a 128-core cluster
                   shows that NVMalloc enables applications to compute problem
                   sizes larger than the physical memory in a cost-effective
                   manner. It can achieve better performance with increased
                   computation time between NVM memory accesses or increased
                   data access locality. In addition, our results suggest that
                   while NVMalloc enables transparent access to NVM-resident
                   variables, the explicit control it provides is crucial to
                   optimize application performance.",
  note          = "Acceptance rate 20.7\% (118/569)"
}
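The byte-addressable access mentioned above rests on standard memory-mapped
I/O. A minimal single-node sketch in C (illustrative only; NVMalloc's actual
allocator and distributed, aggregated SSD store are far richer, and nvm_alloc
is a hypothetical helper):

  /* Sketch: expose a file on an NVM-backed file system as memory. */
  #include <fcntl.h>
  #include <stddef.h>
  #include <sys/mman.h>
  #include <unistd.h>

  void *nvm_alloc(const char *path, size_t bytes)
  {
      int fd = open(path, O_RDWR | O_CREAT, 0600);
      if (fd < 0)
          return NULL;
      if (ftruncate(fd, (off_t)bytes) != 0) {
          close(fd);
          return NULL;
      }
      void *p = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      close(fd);  /* the mapping remains valid after close */
      return (p == MAP_FAILED) ? NULL : p;
  }

A variable placed in such a region is then read and written like ordinary
memory, with the operating system paging against the (slower) NVM device.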
@conference{boehm12file,
  author        = "Swen B{\"o}hm and
                   Christian Engelmann",
  title         = "File {I/O} for {MPI} Applications in Redundant Execution
                   Scenarios",
  booktitle     = "Proceedings of the \href{http://www.pdp2012.org}{$20^{th}$
                   Euromicro International Conference on Parallel, Distributed,
                   and network-based Processing (PDP) 2012}",
  pages         = "112--119",
  month         = feb # "~15-17, ",
  year          = "2012",
  address       = "Garching, Germany",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-4633-9",
  issn          = "1066-6192",
  doi           = "http://dx.doi.org/10.1109/PDP.2012.22",
  url           = "http://www.christian-engelmann.info/publications/boehm12file.pdf",
  url2          = "http://www.christian-engelmann.info/publications/boehm12file.ppt.pdf",
  abstract      = "As multi-petascale and exa-scale high-performance computing
                   (HPC) systems inevitably have to deal with a number of
                   resilience challenges, such as a significant growth in
                   component count and smaller circuit sizes with lower circuit
                   voltages, redundancy may offer an acceptable level of
                   resilience that traditional fault tolerance techniques, such
                   as checkpoint/restart, do not. Although redundancy in HPC is
                   quite controversial due to the associated cost for redundant
                    components, the constantly increasing number of
                   cores-per-processor is tilting this cost calculation toward
                   a system design where computation, such as for redundancy,
                   is much cheaper and communication, needed for
                   checkpoint/restart, is much more expensive. Recent research
                   and development activities in redundancy for Message Passing
                   Interface (MPI) applications focused on
                   availability/reliability models and replication algorithms.
                   This paper takes a first step toward solving an open research
                   problem associated with running a parallel application
                   redundantly, which is file I/O under redundancy. The
                   approach intercepts file I/O calls made by a redundant
                   application to employ coordination protocols that execute
                   file I/O operations in a redundancy-oblivious fashion when
                   accessing a node-local file system, or in a redundancy-aware
                   fashion when accessing a shared networked file system.
                    A proof-of-concept prototype is presented and a number of
                   coordination protocols are described and evaluated. The
                   results show the performance impact for redundantly accessing
                   a shared networked file system, but also demonstrate the
                   capability to regain performance by utilizing MPI
                   communication between replicas and parallel file I/O."
}
@conference{boehm11xsim,
  author        = "Swen B{\"o}hm
                   and Christian Engelmann",
  title         = "{xSim}: {The} Extreme-Scale Simulator",
  booktitle     = "Proceedings of the
                   \href{http://hpcs11.cisedu.info}{International Conference on
                   High Performance Computing and Simulation (HPCS) 2011}",
  pages         = "280--286",
  month         = jul # "~4-8, ",
  year          = "2011",
  address       = "Istanbul, Turkey",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-1-61284-383-4",
  doi           = "http://dx.doi.org/10.1109/HPCSim.2011.5999835",
  url           = "http://www.christian-engelmann.info/publications/boehm11xsim.pdf",
  url2          = "http://www.christian-engelmann.info/publications/boehm11xsim.ppt.pdf",
  abstract      = "Investigating parallel application performance properties at
                   scale is becoming an important part of high-performance
                   computing (HPC) application development and deployment. The
                   Extreme-scale Simulator (xSim) is a performance investigation
                   toolkit that permits running an application in a controlled
                   environment at extreme scale without the need for a
                   respective extreme-scale HPC system. Using a lightweight
                   parallel discrete event simulation, xSim executes a parallel
                   application with a virtual wall clock time, such that
                   performance data can be extracted based on a processor model
                   and a network model. This paper presents significant
                   enhancements to the xSim toolkit prototype that provide a
                   more complete Message Passing Interface (MPI) support and
                   improve its versatility. These enhancements include full
                   virtual MPI group, communicator and collective communication
                   support, and global variables support. The new capabilities
                   are demonstrated by executing the entire NAS Parallel
                   Benchmark suite in a simulated HPC environment.",
  note          = "Acceptance rate 28.1\% (48/171)"
}
@conference{engelmann11redundant,
  author        = "Christian Engelmann
                   and Swen B{\"o}hm",
  title         = "Redundant Execution of {HPC} Applications with {MR-MPI}",
  booktitle     = "Proceedings of the
                   \href{http://www.iasted.org/conferences/home-719.html}
                   {$10^{th}$ IASTED International Conference on Parallel and
                   Distributed Computing and Networks (PDCN) 2011}",
  pages         = "31--38",
  month         = feb # "~15-17, ",
  year          = "2011",
  address       = "Innsbruck, Austria",
  publisher     = "\href{http://www.actapress.com}{ACTA Press, Calgary, AB,
                   Canada}",
  isbn          = "978-0-88986-864-9",
  doi           = "http://dx.doi.org/10.2316/P.2011.719-031",
  url           = "http://www.christian-engelmann.info/publications/engelmann11redundant.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann11redundant.ppt.pdf",
  abstract      = "This paper presents a modular-redundant Message Passing
                   Interface (MPI) solution, MR-MPI, for transparently executing 
                   high-performance computing (HPC) applications in a redundant
                   fashion. The presented work addresses the deficiencies of
                   recovery-oriented HPC, i.e., checkpoint/restart to/from a
                   parallel file system, at extreme scale by adding the
                   redundancy approach to the HPC resilience portfolio. It
                   utilizes the MPI performance tool interface, PMPI, to
                   transparently intercept MPI calls from an application and to
                   hide all redundancy-related mechanisms. A redundantly
                   executed application runs with $r*m$ native MPI processes,
                   where $r$ is the number of MPI ranks visible to the
                   application and $m$ is the replication degree. Messages
                   between redundant nodes are replicated. Partial replication
                   for tunable resilience is supported. The performance results
                    clearly show the negative impact of the $O(m^2)$ messages
                   between replicas. For low-level, point-to-point benchmarks,
                   the impact can be as high as the replication degree. For
                   applications, performance highly depends on the actual
                   communication types and counts. On single-core systems, the
                   overhead can be 0\% for embarrassingly parallel applications
                   independent of the employed redundancy configuration or up
                   to 70-90\% for communication-intensive applications in a
                   dual-redundant configuration. On multi-core systems, the
                   overhead can be significantly higher due to the additional
                   communication contention."
}
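The PMPI mechanism mentioned above works by letting a tool library override an
MPI entry point and forward to its PMPI_ twin. A minimal sketch in C
(illustrative only, not MR-MPI source; NUM_REPLICAS and replica_rank are
hypothetical placeholders):

  /* Sketch: intercept MPI_Send and re-issue it once per replica. */
  #include <mpi.h>

  #define NUM_REPLICAS 2                       /* replication degree m     */

  extern int replica_rank(int app_rank, int k); /* rank of k-th replica    */

  int MPI_Send(const void *buf, int count, MPI_Datatype type,
               int dest, int tag, MPI_Comm comm)
  {
      int rc = MPI_SUCCESS;
      for (int k = 0; k < NUM_REPLICAS && rc == MPI_SUCCESS; k++)
          rc = PMPI_Send(buf, count, type, replica_rank(dest, k),
                         tag, comm);           /* forward to the real MPI  */
      return rc;
  }

Because the application links against the tool first, its unmodified MPI calls
resolve to these wrappers, which is what makes the redundancy transparent.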
@conference{wang10hybrid2,
  author        = "Chao Wang
                   and Frank Mueller
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Hybrid Checkpointing for {MPI} Jobs in {HPC} Environments",
  booktitle     = "Proceedings of the
                   \href{http://grid.sjtu.edu.cn/icpads10}{$16^{th}$ IEEE
                   International Conference on Parallel and Distributed Systems
                   (ICPADS) 2010}",
  pages         = "524--533",
  month         = dec # "~8-10, ",
  year          = "2010",
  address       = "Shanghai, China",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-4307-9",
  doi           = "http://dx.doi.org/10.1109/ICPADS.2010.48",
  url           = "http://www.christian-engelmann.info/publications/wang10hybrid2.pdf",
  url2          = "http://www.christian-engelmann.info/publications/wang10hybrid2.ppt.pdf",
  abstract      = "As the core count in high-performance computing systems keeps
                    increasing, faults are becoming commonplace. Checkpointing
                    addresses such faults but captures full process images even
                    though only a subset of the process image changes between
                    checkpoints. We have designed a hybrid checkpointing
                   technique for MPI tasks of high-performance applications.
                   This technique alternates between full and incremental
                   checkpoints: At incremental checkpoints, only data changed
                   since the last checkpoint is captured. Our implementation
                   integrates new BLCR and LAM/MPI features that complement
                   traditional full checkpoints. This results in significantly
                   reduced checkpoint sizes and overheads with only moderate
                   increases in restart overhead. After accounting for cost and
                   savings, benefits due to incremental checkpoints are an order
                   of magnitude larger than overheads on restarts. We further
                   derive qualitative results indicating an optimal balance
                   between full/incremental checkpoints of our novel approach at
                   a ratio of 1:9, which outperforms both always-full and
                    always-incremental checkpointing.",
  note          = "Acceptance rate 29.6\% (77/188)"
}
@conference{li10functional,
  author        = "Min Li
                   and Sudharshan S. Vazhkudai
                   and Ali R. Butt
                   and Fei Meng
                   and Xiaosong Ma
                   and Youngjae Kim
                   and Christian Engelmann
                   and Galen Shipman",
  title         = "Functional Partitioning to Optimize End-to-End Performance on
                   Many-Core Architectures",
  booktitle     = "Proceedings of the
                   \href{http://sc10.supercomputing.org}{$23^{rd}$ IEEE/ACM
                   International Conference on High Performance Computing,
                   Networking, Storage and Analysis (SC) 2010}",
  pages         = "1--12",
  month         = nov # "~13-19, ",
  year          = "2010",
  address       = "New Orleans, LA, USA",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  isbn          = "978-1-4244-7559-9",
  doi           = "http://dx.doi.org/10.1109/SC.2010.28",
  url           = "http://www.christian-engelmann.info/publications/li10functional.pdf",
  url2          = "http://www.christian-engelmann.info/publications/li10functional.ppt.pdf",
  abstract      = "Scaling computations on emerging massive-core supercomputers
                    is a daunting task, which, coupled with the significantly
                    lagging system I/O capabilities, degrades applications'
                    end-to-end performance. The I/O bottleneck often negates
                   potential performance benefits of assigning additional
                   compute cores to an application. In this paper, we address
                   this issue via a novel functional partitioning (FP) runtime
                   environment that allocates cores to specific application
                   tasks - checkpointing, de-duplication, and scientific data
                   format transformation - so that the deluge of cores can be
                   brought to bear on the entire gamut of application
                   activities. The focus is on utilizing the extra cores to
                   support HPC application I/O activities and also leverage
                   solid-state disks in this context. For example, our
                    evaluation shows that dedicating 1 core on an octa-core
                   machine for checkpointing and its assist tasks using FP can
                   improve overall execution time of a FLASH benchmark on 80 and 
                   160 cores by 43.95\% and 41.34\%, respectively.",
  note          = "Acceptance rate 19.8\% (50/253)"
}
@conference{boehm10aggregation,
  author        = "Swen B{\"o}hm
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Aggregation of Real-Time System Monitoring Data for Analyzing
                   Large-Scale Parallel and Distributed Computing Environments",
  booktitle     = "Proceedings of the \href{http://www.anss.org.au/hpcc2010}
                   {$12^{th}$ IEEE International Conference on High Performance
                   Computing and Communications (HPCC) 2010}",
  pages         = "72--78",
  month         = sep # "~1-3, ",
  year          = "2010",
  address       = "Melbourne, Australia",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-4214-0",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/HPCC.2010.32",
  url           = "http://www.christian-engelmann.info/publications/boehm10aggregation.pdf",
  url2          = "http://www.christian-engelmann.info/publications/boehm10aggregation.ppt.pdf",
  abstract      = "We present a monitoring system for large-scale parallel and
                   distributed computing environments that allows trading off
                   accuracy in a tunable fashion to gain scalability without
                   compromising fidelity. The approach relies on classifying
                   each gathered monitoring metric based on individual needs
                   and on aggregating messages containing classes of individual
                   monitoring metrics using a tree-based overlay network. The
                   MRNet-based prototype is able to significantly reduce the
                   amount of gathered and stored monitoring data, e.g., by a
                   factor of ~56 in comparison to the Ganglia distributed
                   monitoring system. A simple scaling study reveals, however,
                   that further efforts are needed in reducing the amount of
                   data to monitor future-generation extreme-scale systems with
                   up to 1,000,000 nodes. The implemented solution did not have
                   a measurable performance impact as the 32-node test system
                   did not produce enough monitoring data to interfere with
                   running applications.",
  note          = "Acceptance rate 19.1\% (58/304)"
}
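
As a toy illustration of the classification-and-aggregation idea, the
following Python sketch maps raw samples to coarse classes and forwards
only class transitions. The class boundaries and sample stream are
invented; a real deployment would aggregate such streams over an
MRNet-style tree rather than at a single node.

# Accuracy-for-scalability trade-off in miniature: raw metric samples
# are mapped to coarse classes, and only class *changes* are forwarded.
def classify(value, bounds=(25.0, 50.0, 75.0)):
    """Map a raw sample (e.g., CPU utilization in %) to a class index."""
    return sum(value > b for b in bounds)

def aggregate(samples):
    """Emit one message per class transition instead of one per sample."""
    out, last = [], None
    for t, v in samples:
        c = classify(v)
        if c != last:
            out.append((t, c))
            last = c
    return out

samples = [(t, 30 + 30 * ((t // 5) % 3)) for t in range(60)]  # 30/60/90 %
print(len(samples), "samples ->", len(aggregate(samples)), "messages")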
@conference{litvinova10proactive,
  author        = "Antonina Litvinova
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "A Proactive Fault Tolerance Framework for High-Performance
                   Computing",
  booktitle     = "Proceedings of the
                   \href{http://www.iasted.org/conferences/home-676.html}
                   {$9^{th}$ IASTED International Conference on Parallel and
                   Distributed Computing and Networks (PDCN) 2010}",
  pages         = "",
  month         = feb # "~16-18, ",
  year          = "2010",
  address       = "Innsbruck, Austria",
  publisher     = "\href{http://www.actapress.com}{ACTA Press, Calgary, AB,
                   Canada}",
  isbn          = "978-0-88986-783-3",
  doi           = "http://www.actapress.com/Abstract.aspx?paperId=37915",
  url           = "http://www.christian-engelmann.info/publications/litvinova10proactive.pdf",
  url2          = "http://www.christian-engelmann.info/publications/litvinova10proactive.ppt.pdf",
  abstract      = "As high-performance computing (HPC) systems continue to
                   increase in scale, their mean-time to interrupt decreases
                   accordingly. The current state of practice for fault
                   tolerance (FT) is checkpoint/restart. However, with
                   increasing error rates, increasing aggregate memory and not
                   proportionally increasing I/O capabilities, it is becoming
                   less efficient. Proactive FT avoids experiencing failures
                   through preventative measures, such as by migrating
                   application parts away from nodes that are about to fail.
                   This paper presents a proactive FT framework that performs
                   environmental monitoring, event logging, parallel job
                   monitoring and resource monitoring to analyze HPC system
                   reliability and to perform FT through such preventative
                   actions."
}
@conference{taerat09blue,
  author        = "Narate Taerat
                   and Nichamon Naksinehaboon
                   and Clayton Chandler
                   and James Elliott
                   and Chokchai (Box) Leangsuksun
                   and George Ostrouchov
                   and Stephen L. Scott
                   and Christian Engelmann",
  title         = "{Blue Gene/L} Log Analysis and Time to Interrupt Estimation",
  booktitle     = "Proceedings of the
                   \href{http://www.ares-conference.eu/ares2009}{$4^{th}$
                   International Conference on Availability, Reliability and
                   Security (ARES) 2009}",
  pages         = "173--180",
  month         = mar # "~16-19, ",
  year          = "2009",
  address       = "Fukuoka, Japan",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-1-4244-3572-2",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/ARES.2009.105",
  url           = "http://www.christian-engelmann.info/publications/taerat09blue.pdf",
  url2          = "",
  abstract      = "System- and application-level failures could be characterized
                   by analyzing relevant log files. The resulting data might
                   then be used in numerous studies on and future developments
                   for the mission-critical and large scale computational
                   architecture, including fields such as failure prediction,
                   reliability modeling, performance modeling and power
                   awareness. In this paper, system logs covering a six month
                   period of the Blue Gene/L supercomputer were obtained and
                   subsequently analyzed. Temporal filtering was applied to
                   remove duplicated log messages. Optimistic and pessimistic
                   perspectives were exerted on filtered log information to
                   observe failure behavior within the system. Further, various
                   time to repair factors were applied to obtain application
                   time to interrupt, which will be exploited in further
                   resilience modeling research.",
  note          = "Acceptance rate 25.0\% (40/160)"
}
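
The temporal filtering step mentioned in the abstract admits a compact
sketch. The Python fragment below drops repeats of an identical message
that fall inside a fixed window; the window length and the log tuples
are assumptions, not details taken from the paper.

# Temporal filtering: suppress duplicates of a message that recur
# within `window` seconds of the last kept occurrence.
from collections import defaultdict

def temporal_filter(events, window=60.0):
    """events: iterable of (timestamp_in_seconds, message) pairs."""
    last_kept = defaultdict(lambda: float("-inf"))
    for ts, msg in events:
        if ts - last_kept[msg] > window:
            last_kept[msg] = ts
            yield ts, msg            # first occurrence in this window

log = [(0, "ECC error on node 12"), (10, "ECC error on node 12"),
       (300, "ECC error on node 12"), (305, "fan failure on node 7")]
print(list(temporal_filter(log)))    # keeps 3 of the 4 entries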
@conference{engelmann09evaluating,
  author        = "Christian Engelmann
                   and Hong H. Ong
                   and Stephen L. Scott",
  title         = "Evaluating the Shared Root File System Approach for Diskless
                   High-Performance Computing Systems",
  booktitle     = "Proceedings of the
                   \href{http://www.linuxclustersinstitute.org/conferences}
                   {$10^{th}$ LCI International Conference on High-Performance
                   Clustered Computing (LCI) 2009}",
  month         = mar # "~9-12, ",
  year          = "2009",
  address       = "Boulder, CO, USA",
  url           = "http://www.christian-engelmann.info/publications/engelmann09evaluating.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann09evaluating.ppt.pdf",
  abstract      = "Diskless high-performance computing (HPC) systems utilizing
                   networked storage have become popular in the last several
                   years. Removing disk drives significantly increases compute
                   node reliability as they are known to be a major source of
                   failures. Furthermore, networked storage solutions utilizing
                   parallel I/O and replication are able to provide increased
                   scalability and availability. Reducing a compute node to
                   processor(s), memory and network interface(s) greatly reduces
                   its physical size, which in turn allows for large-scale dense
                   HPC solutions. However, one major obstacle is the requirement
                   by certain operating systems (OSs), such as Linux, for a root
                   file system. While one solution is to remove this requirement
                   from the OS, another is to share the root file system over
                   the networked storage. This paper evaluates three networked
                   file system solutions, NFSv4, Lustre and PVFS2, with respect
                   to their performance, scalability, and availability features
                   for servicing a common root file system in a diskless HPC
                   configuration. Our findings indicate that Lustre is a viable
                   solution as it meets both scaling and performance
                   requirements. However, certain availability issues regarding
                   single points of failure and control need to be considered."
}
@conference{engelmann09proactive,
  author        = "Christian Engelmann
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Stephen L. Scott",
  title         = "Proactive Fault Tolerance Using Preemptive Migration",
  booktitle     = "Proceedings of the \href{http://www.pdp2009.org}{$17^{th}$
                   Euromicro International Conference on Parallel, Distributed,
                   and network-based Processing (PDP) 2009}",
  pages         = "252--257",
  month         = feb # "~18-20, ",
  year          = "2009",
  address       = "Weimar, Germany",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-3544-9",
  issn          = "1066-6192",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/PDP.2009.31",
  url           = "http://www.christian-engelmann.info/publications/engelmann09proactive.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann09proactive.ppt.pdf",
  abstract      = "Proactive fault tolerance (FT) in high-performance computing
                   is a concept that prevents compute node failures from
                   impacting running parallel applications by preemptively
                   migrating application parts away from nodes that are about
                   to fail. This paper provides a foundation for proactive FT by
                   defining its architecture and classifying implementation
                   options. This paper further relates prior work to the
                   presented architecture and classification, and discusses the
                   challenges ahead for needed supporting technologies.",
  note          = "Acceptance rate 42.0\% (58/138)"
}
@conference{valentini09high,
  author        = "Alessandro Valentini
                   and Christian Di Biagio
                   and Fabrizio Batino
                   and Guido Pennella
                   and Fabrizio Palma
                   and Christian Engelmann",
  title         = "High Performance Computing with {Harness} over {InfiniBand}",
  booktitle     = "Proceedings of the \href{http://www.pdp2009.org}{$17^{th}$
                   Euromicro International Conference on Parallel, Distributed,
                   and network-based Processing (PDP) 2009}",
  pages         = "151--154",
  month         = feb # "~18-20, ",
  year          = "2009",
  address       = "Weimar, Germany",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-3544-9",
  issn          = "1066-6192",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/PDP.2009.64",
  url           = "http://www.christian-engelmann.info/publications/valentini09high.pdf",
  abstract      = "Harness is an adaptable and plug-in-based middleware
                   framework able to support distributed parallel computing. By
                   now, it is based on the Ethernet protocol which cannot
                   guarantee high performance throughput and Real Time
                   (determinism) performance. During last years, both the
                   research and industry environments have developed both new
                   network architectures (InfiniBand, Myrinet, iWARP, etc.) to
                   avoid those limits. This paper concerns the integration
                   between Harness and InfiniBand focusing on two solutions: IP
                   over InfiniBand (IPoIB) and Socket Direct Protocol (SDP)
                   technology. Those allow Harness middleware to take advantage
                   of the enhanced features provided by InfiniBand.",
  note          = "Acceptance rate 42.0\% (58/138)"
}
@conference{engelmann09case,
  author        = "Christian Engelmann
                   and Hong H. Ong
                   and Stephen L. Scott",
  title         = "The Case for Modular Redundancy in Large-Scale High
                   Performance Computing Systems",
  booktitle     = "Proceedings of the
                   \href{http://www.iasted.org/conferences/home-641.html}
                   {$8^{th}$ IASTED International Conference on Parallel and
                   Distributed Computing and Networks (PDCN) 2009}",
  pages         = "189--194",
  month         = feb # "~16-18, ",
  year          = "2009",
  address       = "Innsbruck, Austria",
  publisher     = "\href{http://www.actapress.com}{ACTA Press, Calgary, AB,
                   Canada}",
  isbn          = "978-0-88986-784-0",
  doi           = "http://www.actapress.com/Abstract.aspx?paperId=34612",
  url           = "http://www.christian-engelmann.info/publications/engelmann09case.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann09case.ppt.pdf",
  abstract      = "Recent investigations into resilience of large-scale
                   high-performance computing (HPC) systems showed a continuous
                   trend of decreasing reliability and availability. Newly
                   installed systems have a lower mean-time to failure (MTTF)
                   and a higher mean-time to recover (MTTR) than their
                   predecessors. Modular redundancy is being used in many
                   mission critical systems today to provide for resilience,
                   such as for aerospace and command \& control systems. The
                   primary argument against modular redundancy for resilience
                   in HPC has always been that the capability of a HPC system,
                   and respective return on investment, would be significantly
                   reduced. We argue that modular redundancy can significantly
                   increase compute node availability as it removes the impact
                   of scale from single compute node MTTR. We further argue that
                   single compute nodes can be much less reliable, and therefore
                   less expensive, and still be highly available, if their
                   MTTR/MTTF ratio is maintained."
}
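
The MTTR/MTTF argument can be restated with the textbook steady-state
availability formulas (standard reliability theory under the assumption
of independent replica failures, not a derivation from the paper):

\[
  A = \frac{\mathrm{MTTF}}{\mathrm{MTTF} + \mathrm{MTTR}}
    = \frac{1}{1 + \mathrm{MTTR}/\mathrm{MTTF}},
  \qquad
  A_n = 1 - (1 - A)^n .
\]

Since $A$ depends only on the ratio $\mathrm{MTTR}/\mathrm{MTTF}$, a
cheaper node with proportionally shorter MTTF is exactly as available
if its MTTR shrinks in step, and dual redundancy lifts, for example,
$A = 0.99$ to $A_2 = 1 - 0.01^2 = 0.9999$.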
@conference{wang08proactive,
  author        = "Chao Wang
                   and Frank Mueller
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Proactive Process-Level Live Migration in {HPC}
                   Environments",
  booktitle     = "Proceedings of the \href{http://sc08.supercomputing.org}
                   {$21^{st}$ IEEE/ACM International Conference on High
                   Performance Computing, Networking, Storage and Analysis (SC)
                   2008}",
  pages         = "1--12",
  month         = nov # "~15-21, ",
  year          = "2008",
  address       = "Austin, TX, USA",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  isbn          = "978-1-4244-2835-9",
  doi           = "http://doi.acm.org/10.1145/1413370.1413414",
  url           = "http://www.christian-engelmann.info/publications/wang08proactive.pdf",
  url2          = "http://www.christian-engelmann.info/publications/wang08proactive.ppt.pdf",
  abstract      = "As the number of nodes in high-performance computing
                   environments keeps increasing, faults are becoming common
                   place. Reactive fault tolerance (FT) often does not scale due
                   to massive I/O requirements and relies on manual job
                   resubmission. This work complements reactive with proactive
                   FT at the process level. Through health monitoring, a subset
                   of node failures can be anticipated when one's health
                   deteriorates. A novel process-level live migration mechanism
                   supports continued execution of applications during much of
                   processes migration. This scheme is integrated into an MPI
                   execution environment to transparently sustain
                   health-inflicted node failures, which eradicates the need to
                   restart and requeue MPI jobs. Experiments indicate that 1-6.5
                   seconds of prior warning are required to successfully trigger
                   live process migration while similar operating system
                   virtualization mechanisms require 13-24 seconds. This
                   self-healing approach complements reactive FT by nearly
                   cutting the number of checkpoints in half when 70\% of the
                   faults are handled proactively.",
  note          = "Acceptance rate 21.3\% (59/277)"
}
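
One plausible way to read the "nearly cutting the number of checkpoints
in half" result is through Young's first-order formula for the optimal
checkpoint interval; this is an interpretation, not a calculation from
the paper. With checkpoint cost $\delta$ and mean time between failures
$M$,

\[
  \tau_{\mathrm{opt}} = \sqrt{2\,\delta M},
  \qquad
  \frac{\tau'_{\mathrm{opt}}}{\tau_{\mathrm{opt}}}
    = \sqrt{\frac{M/0.3}{M}} \approx 1.83 ,
\]

so handling 70\% of failures proactively stretches the effective $M$ by
a factor of $1/0.3$ and cuts the checkpoint frequency to roughly 0.55
of its former value.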
@conference{engelmann08symmetric,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Chokchai (Box) Leangsuksun
                   and Xubin (Ben) He",
  title         = "Symmetric Active/Active Replication for Dependent Services",
  booktitle     = "Proceedings of the
                   \href{http://www.ares-conference.eu/ares2008}{$3^{rd}$
                   International Conference on Availability, Reliability and
                   Security (ARES) 2008}",
  pages         = "260--267",
  month         = mar # "~4-7, ",
  year          = "2008",
  address       = "Barcelona, Spain",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-3102-1",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/ARES.2008.64",
  url           = "http://www.christian-engelmann.info/publications/engelmann08symmetric.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann08symmetric.ppt.pdf",
  abstract      = "During the last several years, we have established the
                   symmetric active/active replication model for service-level
                   high availability and implemented several proof-of-concept
                   prototypes. One major deficiency of our model is its
                   inability to deal with dependent services, since its original
                   architecture is based on the client-service model. This paper
                   extends our model to dependent services using its already
                   existing mechanisms and features. The presented concept is
                   based on the idea that a service may also be a client of
                   another service, and multiple services may be clients of each
                   other. A high-level abstraction is used to illustrate
                   dependencies between clients and services, and to decompose
                   dependencies between services into respective client-service
                   dependencies. This abstraction may be used for providing
                   high availability in distributed computing systems with
                   complex service-oriented architectures.",
  note          = "Acceptance rate 21.1\% (40/190)"
}
@conference{vallee08framework,
  author        = "Geoffroy R. Vall\'ee
                   and Kulathep Charoenpornwattana
                   and Christian Engelmann
                   and Anand Tikotekar
                   and Chokchai (Box) Leangsuksun
                   and Thomas Naughton
                   and Stephen L. Scott",
  title         = "A Framework For Proactive Fault Tolerance",
  booktitle     = "Proceedings of the
                   \href{http://www.ares-conference.eu/ares2008}{$3^{rd}$
                   International Conference on Availability, Reliability and
                   Security (ARES) 2008}",
  pages         = "659--664",
  month         = mar # "~4-7, ",
  year          = "2008",
  address       = "Barcelona, Spain",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-3102-1",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/ARES.2008.171",
  url           = "http://www.christian-engelmann.info/publications/vallee08framework.pdf",
  url2          = "http://www.christian-engelmann.info/publications/vallee08framework.ppt.pdf",
  abstract      = "Fault tolerance is a major concern to guarantee availability
                   of critical services as well as application execution.
                   Traditional approaches for fault tolerance include
                   checkpoint/restart or duplication. However, it is also
                   possible to anticipate failures and proactively take action
                   before failures occur in order to minimize failure impact on
                   the system and application execution. This document presents
                   a proactive fault tolerance framework. This framework can use
                   different proactive fault tolerance mechanisms, i.e.
                   migration and pause/unpause. The framework also allows the
                   implementation of new proactive fault tolerance policies
                   thanks to a modular architecture. A first proactive fault
                   tolerance policy has been implemented and preliminary
                   experiments have been performed based on system-level
                   virtualization and compared with results obtained by
                   simulation.",
  note          = "Acceptance rate 21.1\% (40/190)"
}
@conference{koenning08virtualized,
  author        = "Bj{\"o}rn K{\"o}nning
                   and Christian Engelmann
                   and Stephen L. Scott
                   and George A. (Al) Geist",
  title         = "Virtualized Environments for the {Harness} High Performance
                   Computing Workbench",
  booktitle     = "Proceedings of the \href{http://www.pdp2008.org}{$16^{th}$
                   Euromicro International Conference on Parallel, Distributed,
                   and network-based Processing (PDP) 2008}",
  pages         = "133--140",
  month         = feb # "~13-15, ",
  year          = "2008",
  address       = "Toulouse, France",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-3089-5",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/PDP.2008.14",
  url           = "http://www.christian-engelmann.info/publications/koenning08virtualized.pdf",
  url2          = "http://www.christian-engelmann.info/publications/koenning08virtualized.ppt.pdf",
  abstract      = "This paper describes recent accomplishments in providing a
                   virtualized environment concept and prototype for scientific
                   application development and deployment as part of the Harness
                   High Performance Computing (HPC) Workbench research effort.
                   The presented work focuses on tools and mechanisms that
                   simplify scientific application development and deployment
                   tasks, such that only minimal adaptation is needed when
                   moving from one HPC system to another or after HPC system
                   upgrades. The overall technical approach focuses on the
                   concept of adapting the HPC system environment to the actual
                   needs of individual scientific applications instead of the
                   traditional scheme of adapting scientific applications to
                   individual HPC system environment properties. The presented
                   prototype implementation is based on the mature and
                   lightweight chroot virtualization approach for Unix-type
                   systems with a focus on virtualized file system structure
                   and virtualized shell environment variables utilizing
                   virtualized environment configuration descriptions in
                   Extensible Markup Language (XML) format. The presented work
                   can be easily extended to other virtualization technologies,
                   such as system-level virtualization solutions using
                   hypervisors.",
  note          = "Acceptance rate 40\% (83/207)"
}
@conference{vallee08system,
  author        = "Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Christian Engelmann
                   and Hong H. Ong
                   and Stephen L. Scott",
  title         = "System-level Virtualization for High Performance Computing",
  booktitle     = "Proceedings of the \href{http://www.pdp2008.org}{$16^{th}$
                   Euromicro International Conference on Parallel, Distributed,
                   and network-based Processing (PDP) 2008}",
  pages         = "636--643",
  month         = feb # "~13-15, ",
  year          = "2008",
  address       = "Toulouse, France",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-3089-5",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/PDP.2008.85",
  url           = "http://www.christian-engelmann.info/publications/vallee08system.pdf",
  url2          = "http://www.christian-engelmann.info/publications/vallee08system.ppt.pdf",
  abstract      = "System-level virtualization has been a research topic since
                   the 70`s but regained popularity during the past few years
                   because of the availability of efficient solution such as Xen
                   and the implementation of hardware support in commodity
                   processors (e.g. Intel-VT, AMD-V). However, a majority of
                   system-level virtualization projects is guided by the server
                   consolidation market. As a result, current virtualization
                   solutions appear to not be suitable for high performance
                   computing (HPC) which is typically based on large-scale
                   systems. On another hand there is significant interest in
                   exploiting virtual machines (VMs) within HPC for a number of
                   other reasons. By virtualizing the machine, one is able to
                   run a variety of operating systems and environments as needed
                   by the applications. Virtualization allows users to isolate
                   workloads, improving security and reliability. It is also
                   possible to support non-native environments and/or legacy
                   operating environments through virtualization. In addition,
                   it is possible to balance work loads, use migration
                   techniques to relocate applications from failing machines,
                   and isolate fault systems for repair. This document presents
                   the challenges for the implementation of a system-level
                   virtualization solution for HPC. It also presents a brief
                   survey of the different approaches and techniques to address
                   these challenges.",
  note          = "Acceptance rate 40\% (83/207)"
}
@conference{ou07symmetric,
  author        = "Li Ou
                   and Christian Engelmann
                   and Xubin (Ben) He
                   and Xin Chen
                   and Stephen L. Scott",
  title         = "Symmetric Active/Active Metadata Service for Highly Available
                   Cluster Storage Systems",
  booktitle     = "Proceedings of the
                   \href{http://www.iasted.org/conferences/home-590.html}
                   {$19^{th}$ IASTED International Conference on Parallel and
                   Distributed Computing and Systems (PDCS) 2007}",
  pages         = "",
  month         = nov # "~19-21, ",
  year          = "2007",
  address       = "Cambridge, MA, USA",
  publisher     = "\href{http://www.actapress.com}{ACTA Press, Calgary, AB,
                   Canada}",
  isbn          = "978-0-88986-703-1",
  doi           = "http://www.actapress.com/Abstract.aspx?paperId=32008",
  url           = "http://www.christian-engelmann.info/publications/ou07symmetric.pdf",
  url2          = "http://www.christian-engelmann.info/publications/ou07symmetric.ppt.pdf",
  abstract      = "In a typical distributed storage system, metadata is stored
                   and managed by dedicated metadata servers. One way to improve
                   the availability of distributed storage systems is to deploy
                   multiple metadata servers. Past research focused on the
                   active/standby model, where each active server has at least
                   one redundant idle backup. However, interruption of service
                   and loss of service state may occur during a fail-over
                   depending on the used replication technique. The research in
                   this paper targets the symmetric active/active replication
                   model using multiple redundant service nodes running in
                   virtual synchrony. In this model, service node failures do
                   not cause a fail-over to a backup and there is no disruption
                   of service or loss of service state. We propose a fast
                   delivery protocol to reduce the latency of total order
                   broadcast. Our prototype implementation shows that high
                   availability of metadata servers can be achieved with an
                   acceptable performance trade-off using the active/active
                   metadata server solution.",
  note          = "Acceptance rate 49\%"
}
@conference{disaverio07distributed,
  author        = "Emanuele Di Saverio
                   and Marco Cesati
                   and Christian Di Biagio
                   and Guido Pennella
                   and Christian Engelmann",
  title         = "Distributed Real-Time Computing with {Harness}",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://pvmmpi07.lri.fr}{$14^{th}$ European PVM/MPI
                   Users` Group Meeting (EuroPVM/MPI) 2007}",
  pages         = "281--288",
  volume        = "4757",
  month         = sep # "~30 - " # oct # "~3, ",
  year          = "2007",
  address       = "Paris, France",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "978-3-540-75415-2",
  issn          = "0302-9743",
  doi           = "http://dx.doi.org/10.1007/978-3-540-75416-9_39",
  url           = "http://www.christian-engelmann.info/publications/disaverio07distributed.pdf",
  url2          = "http://www.christian-engelmann.info/publications/disaverio07distributed.ppt.pdf",
  abstract      = "Modern parallel and distributed computing solutions are often
                   built onto a middleware software layer providing a higher
                   and common level of service between computational nodes.
                   Harness is an adaptable, plugin-based middleware framework
                   for parallel and distributed computing. This paper reports
                   recent research and development results of using Harness for
                   real-time distributed computing applications in the context
                   of an industrial environment with the needs to perform
                   several safety critical tasks. The presented work exploits
                   the modular architecture of Harness in conjunction with a
                   lightweight threaded implementation to resolve several
                   real-time issues by adding three new Harness plug-ins to
                   provide a prioritized lightweight execution environment, low
                   latency communication facilities, and local timestamped event
                   logging."
}
@conference{ou07fast,
  author        = "Li Ou
                   and Xubin (Ben) He
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "A Fast Delivery Protocol for Total Order Broadcasting",
  booktitle     = "Proceedings of the \href{http://www.icccn.org/icccn07}
                   {$16^{th}$ IEEE International Conference on Computer
                   Communications and Networks (ICCCN) 2007}",
  pages         = "730--734",
  month         = aug # "~13-16, ",
  year          = "2007",
  address       = "Honolulu, HI, USA",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-1-42441-251-8",
  issn          = "1095-2055",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/ICCCN.2007.4317904",
  url           = "http://www.christian-engelmann.info/publications/ou07fast.pdf",
  url2          = "http://www.christian-engelmann.info/publications/ou07fast.ppt.pdf",
  abstract      = "Sequencer, privilege-based, and communication history
                   algorithms are popular approaches to implement total
                   ordering, where communication history algorithms are most
                   suitable for parallel computing systems, because they provide
                   the best performance under heavy workloads. Unfortunately,
                   post-transmission delay of communication history algorithms
                   is most apparent when a system is idle. In this paper, we
                   propose a fast delivery protocol to reduce the latency of
                   message ordering. The protocol optimizes the total ordering
                   process by waiting for messages only from a subset of the
                   machines in the group, and by fast acknowledging messages on
                   behalf of other machines. Our test results indicate that the
                   fast delivery protocol is suitable for both idle and heavy
                   load systems, while reducing the latency of message
                   ordering.",
  note          = "Acceptance rate 29.1\% (160/550)"
}
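
The deliverability test that the fast delivery protocol accelerates can
be sketched compactly. The Python fragment below implements the classic
communication-history condition (a message is stable once every other
peer has been heard from with a later Lamport timestamp); the paper's
optimization, waiting on only a subset of peers and acknowledging on
behalf of the rest, is indicated in a comment but not implemented.

# Communication-history total ordering: buffer messages in a min-heap
# keyed by (Lamport timestamp, sender) and deliver a message once no
# peer can still send anything that would order before it.
import heapq

class TotalOrder:
    def __init__(self, peers):
        self.last_ts = {p: 0 for p in peers}   # newest timestamp per peer
        self.pending = []                      # heap of (ts, sender, msg)

    def on_receive(self, ts, sender, msg=None):
        """Record a message (or a bare ack/heartbeat if msg is None)."""
        self.last_ts[sender] = max(self.last_ts[sender], ts)
        if msg is not None:
            heapq.heappush(self.pending, (ts, sender, msg))

    def deliverable(self):
        """Pop messages in total order as they become stable."""
        out = []
        while self.pending:
            ts, sender, msg = self.pending[0]
            # Stable when every *other* peer has advanced past ts; the
            # fast delivery protocol would wait on only a subset of
            # peers and fast-acknowledge on behalf of the others.
            others = (t for p, t in self.last_ts.items() if p != sender)
            if all(t > ts for t in others):
                out.append(heapq.heappop(self.pending)[2])
            else:
                break
        return out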
@conference{nagarajan07proactive,
  author        = "Arun B. Nagarajan
                   and Frank Mueller
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Proactive Fault Tolerance for {HPC} with {Xen}
                   Virtualization",
  booktitle     = "Proceedings of the \href{http://ics07.ac.upc.edu}{$21^{st}$
                   ACM International Conference on Supercomputing (ICS) 2007}",
  pages         = "23--32",
  month         = jun # "~16-20, ",
  year          = "2007",
  address       = "Seattle, WA, USA",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  isbn          = "978-1-59593-768-1",
  doi           = "http://doi.acm.org/10.1145/1274971.1274978",
  url           = "http://www.christian-engelmann.info/publications/nagarajan07proactive.pdf",
  url2          = "http://www.christian-engelmann.info/publications/nagarajan07proactive.ppt.pdf",
  abstract      = "Large-scale parallel computing is relying increasingly on
                   clusters with thousands of processors. At such large counts
                   of compute nodes, faults are becoming common place. Current
                   techniques to tolerate faults focus on reactive schemes to
                   recover from faults and generally rely on a
                   checkpoint/restart mechanism. Yet, in today's systems, node
                   failures can often be anticipated by detecting a
                   deteriorating health status. Instead of a reactive scheme for
                   fault tolerance (FT), we are promoting a proactive one where
                   processes automatically migrate from unhealthy nodes to
                   healthy ones. Our approach relies on operating system
                   virtualization techniques exemplified by but not limited to
                   Xen. This paper contributes an automatic and transparent
                   mechanism for proactive FT for arbitrary MPI applications.
                   It leverages virtualization techniques combined with health
                   monitoring and load-based migration. We exploit Xen's live
                   migration mechanism for a guest operating system (OS) to
                   migrate an MPI task from a health-deteriorating node to a
                   healthy one without stopping the MPI task during most of the
                   migration. Our proactive FT daemon orchestrates the tasks of
                   health monitoring, load determination and initiation of guest
                   OS migration. Experimental results demonstrate that live
                   migration hides migration costs and limits the overhead to
                   only a few seconds making it an attractive approach to
                   realize FT in HPC systems. Overall, our enhancements make
                   proactive FT a valuable asset for long-running MPI
                   applications, complementary to reactive FT using full
                   checkpoint/restart schemes since checkpoint frequencies can
                   be reduced as fewer unanticipated failures are encountered.
                   In the context of OS virtualization, we believe that this is
                   the first comprehensive study of proactive fault tolerance
                   where live migration is actually triggered by health
                   monitoring.",
  note          = "Acceptance rate 23.6\% (29/123). Most cited paper with 178 citations"
}
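
The daemon described above is not reproduced here; the following
heavily simplified Python sketch conveys its control loop. The health
source, threshold, and spare-node selection are invented, and
`xm migrate --live` is the classic Xen toolstack command (newer
toolstacks use `xl migrate`).

# Proactive-FT control loop in miniature: poll a health metric and,
# once it crosses a threshold, live-migrate the guest to a spare node.
import subprocess, time

THRESHOLD_C = 85.0                           # invented trip point
SPARES = ["spare-node-01", "spare-node-02"]  # hypothetical spare pool

def cpu_temperature():
    # Placeholder health probe; a real daemon would read IPMI or
    # lm_sensors data, as the paper's health monitoring does.
    with open("/sys/class/thermal/thermal_zone0/temp") as f:
        return int(f.read()) / 1000.0

def watch(domain, poll_interval=5.0):
    while True:
        if cpu_temperature() > THRESHOLD_C:
            target = SPARES.pop(0)
            subprocess.run(["xm", "migrate", "--live", domain, target],
                           check=True)       # guest keeps running
            return target
        time.sleep(poll_interval)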
@conference{engelmann07programming,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Chokchai (Box) Leangsuksun
                   and Xubin (Ben) He",
  title         = "On Programming Models for Service-Level High Availability",
  booktitle     = "Proceedings of the
                   \href{http://www.ares-conference.eu/ares2007}{$2^{nd}$
                   International Conference on Availability, Reliability and
                   Security (ARES) 2007}",
  pages         = "999--1006",
  month         = apr # "~10-13, ",
  year          = "2007",
  address       = "Vienna, Austria",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "0-7695-2775-2",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/ARES.2007.109",
  url           = "http://www.christian-engelmann.info/publications/engelmann07programming.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann07programming.ppt.pdf",
  abstract      = "This paper provides an overview of existing programming
                   models for service-level high availability and investigates
                   their differences, similarities, advantages, and
                   disadvantages. Its goal is to help to improve reuse of code
                   and to allow adaptation to quality of service requirements by
                   using a uniform programming model description. It further
                   aims at encouraging a discussion about these programming
                   models and their provided quality of service, such as
                   availability, performance, serviceability, usability, and
                   applicability. Within this context, the presented research
                   focuses on providing high availability for services running
                   on head and service nodes of high-performance computing
                   systems.",
  note          = "Acceptance rate 28.3\% (60/212)"
}
@conference{wang07job,
  author        = "Chao Wang
                   and Frank Mueller
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "A Job Pause Service under {LAM/MPI+BLCR} for Transparent
                   Fault Tolerance",
  booktitle     = "Proceedings of the \href{http://www.ipdps.org/ipdps2007}
                   {$21^{st}$ IEEE International Parallel and Distributed
                   Processing Symposium (IPDPS) 2007}",
  pages         = "1-10",
  month         = mar # "~26-30, ",
  year          = "2007",
  address       = "Long Beach, CA, USA",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  isbn          = "978-1-59593-768-1",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/IPDPS.2007.370307",
  url           = "http://www.christian-engelmann.info/publications/wang07job.pdf",
  url2          = "http://www.christian-engelmann.info/publications/wang07job.ppt.pdf",
  abstract      = "Checkpoint/restart (C/R) has become a requirement for
                   long-running jobs in large-scale clusters due to a
                   mean-time-to-failure (MTTF) on the order of hours. After a
                   failure, C/R mechanisms generally require a complete restart
                   of an MPI job from the last checkpoint. A complete restart,
                   however, is unnecessary since all but one node are typically
                   still alive. Furthermore, a restart may result in lengthy job
                   requeuing even though the original job had not exceeded its
                   time quantum. In this paper, we overcome these shortcomings.
                   Instead of job restart, we have developed a transparent
                   mechanism for job pause within LAM/MPI+BLCR. This mechanism
                   allows live nodes to remain active and roll back to the last
                   checkpoint while failed nodes are dynamically replaced by
                   spares before resuming from the last checkpoint. Our
                   methodology includes LAM/MPI enhancements in support of
                   scalable group communication with a fluctuating number of
                   nodes, reuse of network connections, transparent coordinated
                   checkpoint scheduling and a BLCR enhancement for job pause.
                   Experiments in a cluster with the NAS Parallel Benchmark
                   suite show that our overhead for job pause is comparable to
                   that of a complete job restart. A minimal overhead of 5.6\%
                   is incurred only when migration takes place, while the
                   regular checkpoint overhead remains unchanged. Yet, our
                   approach alleviates the need to reboot the LAM run-time
                   environment, which accounts for considerable overhead
                   resulting in net savings of our scheme in the experiments.
                   Our solution further provides full transparency and
                   automation with the additional benefit of reusing existing
                   resources. Execution continues after failures within the
                   scheduled job, \textit{i.e.}, the application staging
                   overhead is not incurred again in contrast to a restart.
                   Our scheme offers additional potential for savings through
                   incremental checkpointing and proactive diskless live
                   migration, which we are currently working on.",
  note          = "Acceptance rate 26\% (109/419)"
}
@conference{uhlemann06joshua,
  author        = "Kai Uhlemann
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "{JOSHUA}: {S}ymmetric Active/Active Replication for Highly
                   Available {HPC} Job and Resource Management",
  booktitle     = "Proceedings of the \href{http://cluster2006.org}{$8^{th}$
                   IEEE International Conference on Cluster Computing (Cluster)
                   2006}",
  pages         = "1-10",
  month         = sep # "~25-28, ",
  year          = "2006",
  address       = "Barcelona, Spain",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "1-4244-0328-6",
  issn          = "1552-5244",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/CLUSTR.2006.311855",
  url           = "http://www.christian-engelmann.info/publications/uhlemann06joshua.pdf",
  url2          = "http://www.christian-engelmann.info/publications/uhlemann06joshua.ppt.pdf",
  abstract      = "Most of today`s HPC systems employ a single head node for
                   control, which represents a single point of failure as it
                   interrupts an entire HPC system upon failure. Furthermore, it
                   is also a single point of control as it disables an entire
                   HPC system until repair. One of the most important HPC system
                   services running on the head node is job and resource
                   management. If it goes down, all currently running jobs lose
                   the service they report back to. They have to be restarted
                   once the head node is up and running again. With this paper,
                   we present a generic approach for providing symmetric
                   active/active replication for highly available HPC job and
                   resource management. The JOSHUA solution provides a virtually
                   synchronous environment for continuous availability without
                   any interruption of service and without any loss of state.
                   Replication is performed externally via the PBS service
                   interface without the need to modify any service code. Test
                   results as well as availability analysis of our
                   proof-of-concept prototype implementation show that
                   continuous availability can be provided by JOSHUA with an
                   acceptable performance trade-off.",
  note          = "Acceptance rate 33.1\% (42/127)"
}
@conference{baumann06parallel,
  author        = "Ronald Baumann
                   and Christian Engelmann
                   and George A. (Al) Geist",
  title         = "A Parallel Plug-in Programming Paradigm",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://hpcc06.lrr.in.tum.de}{$7^{th}$ International
                   Conference on High Performance Computing and Communications
                   (HPCC) 2006}",
  volume        = "4208",
  pages         = "823--832",
  month         = sep # "~13-15, ",
  year          = "2006",
  address       = "Munich, Germany",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "978-3-540-39368-9",
  issn          = "0302-9743",
  doi           = "http://dx.doi.org/10.1007/11847366_85",
  url           = "http://www.christian-engelmann.info/publications/baumann06parallel.pdf",
  url2          = "http://www.christian-engelmann.info/publications/baumann06parallel.ppt.pdf",
  abstract      = "Software component architectures allow assembly of
                   applications from individual software modules based on
                   clearly defined programming interfaces, thus improving the
                   reuse of existing solutions and simplifying application
                   development. Furthermore, the plug-in programming paradigm
                   additionally enables runtime reconfigurability, making it
                   possible to adapt to changing application needs, such as
                   different application phases, and system properties, like
                   resource availability, by loading/unloading appropriate
                   software modules. Similar to parallel programs, parallel
                   plug-ins are an abstraction for a set of cooperating
                   individual plug-ins within a parallel application utilizing
                   a software component architecture. Parallel programming
                   paradigms apply to parallel plug-ins in the same way they
                   apply to parallel programs. The research presented in this
                   paper targets the clear definition of parallel plug-ins and
                   the development of a parallel plug-in programming paradigm."
}
@conference{varma06scalable,
  author        = "Jyothish Varma
                   and Chao Wang
                   and Frank Mueller
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Scalable, Fault-Tolerant Membership for {MPI} Tasks on {HPC}
                   Systems",
  booktitle     = "Proceedings of the \href{http://www.ics-conference.org/2006}
                   {$20^{th}$ ACM International Conference on Supercomputing
                   (ICS) 2006}",
  pages         = "219--228",
  month         = jun # "~28-30, ",
  year          = "2006",
  address       = "Cairns, Australia",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  doi           = "http://doi.acm.org/10.1145/1183401.1183433",
  isbn          = "1-59593-282-8",
  url           = "http://www.christian-engelmann.info/publications/varma06scalable.pdf",
  url2          = "http://www.christian-engelmann.info/publications/varma06scalable.ppt.pdf",
  abstract      = "Reliability is increasingly becoming a challenge for
                   high-performance computing (HPC) systems with thousands of
                   nodes, such as IBM's Blue Gene/L. A shorter
                   mean-time-to-failure can be addressed by adding fault
                   tolerance to reconfigure working nodes to ensure that
                   communication and computation can progress. However, existing
                   approaches fall short in providing scalability and small
                   reconfiguration overhead within the fault-tolerant layer.
                   This paper contributes a scalable approach to reconfigure the
                   communication infrastructure after node failures. We propose
                   a decentralized (peer-to-peer) protocol that maintains a
                   consistent view of active nodes in the presence of faults.
                   Our protocol shows response times on the order of hundreds of
                   microseconds and single-digit milliseconds for 
                   reconfiguration using MPI over Blue Gene/L and TCP over 
                   Gigabit, respectively. The protocol can be adapted to match
                   the network topology to further increase performance. We also
                   verify experimental results against a performance model,
                   which demonstrates the scalability of the approach. Hence,
                   the membership service is suitable for deployment in the
                   communication layer of MPI runtime systems, and we have
                   integrated an early version into LAM/MPI.",
  note          = "Acceptance rate 26.2\% (37/141)"
}
@conference{okunbor06exploring,
  author        = "Daniel I. Okunbor
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Exploring Process Groups for Reliability, Availability and
                   Serviceability of Terascale Computing Systems",
  booktitle     = "Proceedings of the
                   \href{http://www.atiner.gr/docs/2006AAAPROGRAM_COMP.htm}
                   {$2^{nd}$ International Conference on Computer Science and
                   Information Systems 2006}",
  month         = jun # "~19-21, ",
  year          = "2006",
  address       = "Athens, Greece",
  url           = "http://www.christian-engelmann.info/publications/okunbor06exploring.pdf",
  abstract      = "This paper presents various aspects of reliability,
                   availability and serviceability (RAS) systems as they relate
                   to group communication service, including reliable and total
                   order multicast/broadcast, virtual synchrony, and failure
                   detection. While the issue of availability, particularly
                   high availability using replication-based architectures, has
                   recently received an upsurge of research interest, much still
                   has to be done in understanding the basic underlying concepts
                   for
                   achieving RAS systems, especially in high-end and high
                   performance computing (HPC) communities. Various attributes
                   of group communication service and the prototype of symmetric
                   active replication following ideas utilized in the Newtop
                   protocol will be discussed. We explore the application of
                   group communication service for RAS HPC, laying the
                   groundwork for its integrated model."
}
@conference{limaye05jobsite,
  author        = "Kshitij Limaye
                   and Chokchai (Box) Leangsuksun
                   and Zeno Greenwood
                   and Stephen L. Scott
                   and Christian Engelmann
                   and Richard M. Libby
                   and Kasidit Chanchio",
  title         = "Job-Site Level Fault Tolerance for Cluster and {Grid}
                   Environments",
  booktitle     = "Proceedings of the \href{http://cluster2005.org}{$7^{th}$
                   IEEE International Conference on Cluster Computing (Cluster)
                   2005}",
  pages         = "1--9",
  month         = sep # "~26-30, ",
  year          = "2005",
  address       = "Boston, MA, USA",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "0-7803-9486-0",
  issn          = "1552-5244",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/CLUSTR.2005.347043",
  url           = "http://www.christian-engelmann.info/publications/limaye05job-site.pdf",
  abstract      = "In order to adopt high performance clusters and Grid
                   computing for mission critical applications, fault tolerance
                   is a necessity. Common fault tolerance techniques in
                   distributed systems are normally achieved with
                   checkpoint-recovery and job replication on alternative
                   resources, in cases of a system outage. The first approach
                   depends on the system's MTTR while the latter approach
                   depends on the availability of alternative sites to run
                   replicas. There is a need for complementing these approaches
                   by proactively handling failures at a job-site level,
                   ensuring high system availability with no loss of
                   user-submitted jobs. This paper discusses a novel fault
                   tolerance technique that enables job-site recovery in Beowulf
                   cluster-based grid environments, whereas existing techniques
                   give up a failed system by seeking alternative resources.
                   Our results suggest sizable aggregate performance improvement
                   from an implementation of our method in Globus-enabled
                   HA-OSCAR. The technique called Smart Failover provides a
                   transparent and graceful recovery mechanism that saves job
                   states in a local job-manager queue and transfers those
                   states to the backup server periodically, and in critical
                   system events. Thus whenever a failover occurs, the backup
                   server is able to restart the jobs from their last saved
                   state.",
  note          = "Acceptance rate 39.6\% (45/138)"
}
@conference{song05umlbased,
  author        = "Hertong Song
                   and Chokchai (Box) Leangsuksun
                   and Raja Nassar
                   and Yudan Liu
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "{UML-based} {Beowulf} Cluster Availability Modeling",
  booktitle     = "\href{http://www.world-academy-of-science.org/IMCSE2005/ws/SERP}
                   {International Conference on Software Engineering Research
                   and Practice (SERP) 2005}",
  pages         = "161--167",
  month         = jun # "~27-30, ",
  year          = "2005",
  address       = "Las Vegas, NV, USA",
  publisher     = "CSREA Press",
  isbn          = "1-932415-49-1"
}
@conference{engelmann05superscalable,
  author        = "Christian Engelmann
                   and George A. (Al) Geist",
  title         = "Super-Scalable Algorithms for Computing on 100,000
                   Processors",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://www.iccs-meeting.org/iccs2005}{$5^{th}$
                   International Conference on Computational Science (ICCS)
                   2005}, Part I",
  volume        = "3514",
  pages         = "313--320",
  month         = may # "~22-25, ",
  year          = "2005",
  address       = "Atlanta, GA, USA",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "978-3-540-26032-5",
  issn          = "0302-9743",
  doi           = "http://dx.doi.org/10.1007/11428831_39",
  url           = "http://www.christian-engelmann.info/publications/engelmann05superscalable.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann05superscalable.ppt.pdf",
  abstract      = "In the next five years, the number of processors in high-end
                   systems for scientific computing is expected to rise to tens
                   and even hundreds of thousands. For example, the IBM Blue
                   Gene/L can have up to 128,000 processors and the delivery of
                   the first system is scheduled for 2005. Existing deficiencies
                   in scalability and fault-tolerance of scientific applications
                   need to be addressed soon. If the number of processors grows
                   by an order of magnitude and efficiency drops by an order of
                   magnitude, the overall effective computing performance stays
                   the same.
                   Furthermore, the mean time to interrupt of high-end computer
                   systems decreases with scale and complexity. In a
                   100,000-processor system, failures may occur every couple of
                   minutes and traditional checkpointing may no longer be
                   feasible. With this paper, we summarize our recent research
                   in super-scalable algorithms for computing on 100,000
                   processors. We introduce the algorithm properties of scale
                   invariance and natural fault tolerance, and discuss how they
                   can be applied to two different classes of algorithms. We
                   also describe a super-scalable diskless checkpointing
                   algorithm for problems that cannot be transformed into a
                   super-scalable variant, or where other solutions are more
                   efficient. Finally, a 100,000-processor simulator is
                   presented as a platform for testing and experimentation.",
  note          = "Acceptance rate 35\%"
}
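
The diskless checkpointing mentioned in this abstract can be illustrated with
XOR parity: each rank keeps its checkpoint in memory, and one reduction
computes a parity block from which any single lost checkpoint can be rebuilt
by XOR-ing the survivors' copies with the parity. A sketch under those
assumptions (a single parity holder; the names are ours, not the paper's
exact scheme):

/* Diskless checkpointing via XOR parity; illustrative, not the paper's
 * precise super-scalable algorithm. */
#include <mpi.h>

#define CKPT_WORDS 1024

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    unsigned local[CKPT_WORDS];   /* in-memory checkpoint of this rank */
    unsigned parity[CKPT_WORDS];  /* XOR of all checkpoints, held on rank 0 */
    for (int i = 0; i < CKPT_WORDS; i++) local[i] = (unsigned)(rank + i);

    /* One reduction computes the parity; losing a single rank later is
     * recoverable without any disk I/O. */
    MPI_Reduce(local, parity, CKPT_WORDS, MPI_UNSIGNED, MPI_BXOR, 0,
               MPI_COMM_WORLD);

    MPI_Finalize();
    return 0;
}
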
@conference{naughton14what,
  author        = "Thomas Naughton
                   and Garry Smith
                   and Christian Engelmann
                   and Geoffroy Vall{\'e}e
                   and Ferrol Aderholdt
                   and Stephen L. Scott",
  title         = "What is the right balance for performance and isolation with
                   virtualization in {HPC}?",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://europar2014.dcc.fc.up.pt}{$20^{th}$
                   European Conference on Parallel and Distributed Computing
                   (Euro-Par) 2014 Workshops}:
                   \href{http://xcr.cenit.latech.edu/resilience2014}{$7^{th}$
                   Workshop on Resiliency in High Performance Computing
                   (Resilience) in Clusters, Clouds, and Grids}",
  pages         = "",
  month         = aug # "~25, ",
  year          = "2014",
  address       = "Porto, Portugal",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "",
  issn          = "",
  doi           = "",
  url           = "",
  url2          = "",
  abstract      = "",
  note          = "To appear"
}
@conference{engelmann13toward,
  author        = "Christian Engelmann
                   and Thomas Naughton",
  title         = "Toward a Performance/Resilience Tool for Hardware/Software Co-Design of High-Performance Computing Systems",
  booktitle     = "Proceedings of the
                   \href{http://icpp2013.ens-lyon.fr}{$42^{nd}$ International
                   Conference on Parallel Processing (ICPP) 2013}:
                   \href{http://www.psti-workshop.org} {$4^{th}$ International
                   Workshop on Parallel Software Tools and Tool Infrastructures
                   (PSTI)}",
  pages         = "962-971",
  month         = oct # "~2, ",
  year          = "2013",
  address       = "Lyon, France",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-5117-3",
  issn          = "0190-3918",
  doi           = "http://dx.doi.org/10.1109/ICPP.2013.114",
  url           = "http://www.christian-engelmann.info/publications/engelmann13toward.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann13toward.ppt.pdf",
  abstract      = "xSim is a simulation-based performance investigation toolkit
                   that permits running high-performance computing (HPC)
                   applications in a controlled environment with millions of
                   concurrent execution threads, while observing application
                   performance in a simulated extreme-scale system for
                   hardware/software co-design. The presented work details newly
                   developed features for xSim that permit the injection of MPI
                   process failures, the propagation/detection/notification of
                   such failures within the simulation, and their handling using
                   application-level checkpoint/restart. These new capabilities
                   enable the observation of application behavior and
                   performance under failure within a simulated
                   future-generation HPC system using the most common fault
                   handling technique."
}
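
Application-level checkpoint/restart, the fault-handling technique this paper
injects failures against, amounts to periodically serializing progress state
and resuming from the last snapshot after a failure. A minimal per-rank
skeleton; the file naming and restart logic are illustrative assumptions, not
part of the xSim API:

/* Application-level checkpoint/restart skeleton; illustrative only. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, step = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    char name[64];
    snprintf(name, sizeof(name), "ckpt.%d", rank);

    /* Restart path: if a checkpoint exists, resume from its step counter. */
    FILE *f = fopen(name, "rb");
    if (f) {
        if (fread(&step, sizeof(step), 1, f) != 1) step = 0;
        fclose(f);
    }

    for (; step < 100; step++) {
        /* ... one iteration of application work ... */
        if (step % 10 == 0) {              /* periodic checkpoint */
            f = fopen(name, "wb");
            fwrite(&step, sizeof(step), 1, f);
            fclose(f);
        }
        MPI_Barrier(MPI_COMM_WORLD);       /* keep ranks loosely in step */
    }
    MPI_Finalize();
    return 0;
}
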
@conference{lagadapati13tools,
  author        = "Mahesh Lagadapati
                   and Frank Mueller
                   and Christian Engelmann",
  title         = "Tools for Simulation and Benchmark Generation at Exascale",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://tools.zih.tu-dresden.de/2013/}{$7^{th}$
                   Parallel Tools Workshop}",
  pages         = "19--24",
  month         = sep # "~3-4, ",
  year          = "2013",
  address       = "Dresden, Germany",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "978-3-319-08143-4",
  doi           = "http://dx.doi.org/10.1007/978-3-319-08144-1_2",
  url           = "http://www.christian-engelmann.info/publications/lagadapati13tools.pdf",
  url2          = "http://www.christian-engelmann.info/publications/lagadapati13tools.ppt.pdf",
  abstract      = "The path to exascale high-performance computing (HPC) poses several
                   challenges related to power, performance, resilience, productivity,
                   programmability, data movement, and data management. Investigating the
                   performance of parallel applications at scale on future architectures
                   and the performance impact of different architecture choices is an
                   important component of HPC hardware/software co-design. Simulations
                   using models of future HPC systems and communication traces from
                   applications running on existing HPC systems can offer an insight into
                   the performance of future architectures. This work targets technology
                   developed for scalable application tracing of communication events and
                   memory profiles, but can be extended to other areas, such as I/O,
                   control flow, and data flow. It further focuses on extreme-scale
                   simulation of millions of Message Passing Interface (MPI) ranks using
                   a lightweight parallel discrete event simulation (PDES) toolkit for
                   performance evaluation. Instead of simply replaying a trace within a
                   simulation, the approach is to generate a benchmark from it and to run
                   this benchmark within a simulation using models to reflect the
                   performance characteristics of future-generation HPC systems. This
                   provides a number of benefits, such as eliminating the data intensive
                   trace replay and enabling simulations at different scales. The
                   presented work utilizes the ScalaTrace tool to generate scalable trace
                   files, the ScalaBenchGen tool to generate the benchmark, and the xSim
                   tool to run the benchmark within a simulation."
}
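
The benchmark-generation step can be pictured as a small translator: read
communication events from a trace and emit MPI source code that reproduces
the pattern. The toy generator below invents a trivial line-oriented trace
format ("send 1 1024" on stdin) purely for illustration; ScalaTrace and
ScalaBenchGen use far richer, compressed trace representations:

/* Toy trace-to-benchmark generator; the trace format is invented here. */
#include <stdio.h>
#include <string.h>

int main(void)
{
    char op[16]; int peer, bytes;
    puts("#include <mpi.h>");
    puts("int main(int c, char **v) { MPI_Init(&c, &v); char buf[1<<20];");
    puts("int r; MPI_Comm_rank(MPI_COMM_WORLD, &r);");
    /* one generated call per trace record */
    while (scanf("%15s %d %d", op, &peer, &bytes) == 3) {
        if (!strcmp(op, "send"))
            printf("MPI_Send(buf, %d, MPI_CHAR, %d, 0, MPI_COMM_WORLD);\n",
                   bytes, peer);
        else if (!strcmp(op, "recv"))
            printf("MPI_Recv(buf, %d, MPI_CHAR, %d, 0, MPI_COMM_WORLD,"
                   " MPI_STATUS_IGNORE);\n", bytes, peer);
    }
    puts("MPI_Finalize(); return 0; }");
    return 0;
}
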
@conference{naughton13using,
  author        = "Thomas Naughton
                   and Swen B{\"o}hm
                   and Christian Engelmann
                   and Geoffroy Vall{\'e}e",
  title         = "Using Performance Tools to Support Experiments in HPC
                   Resilience",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://www.europar2013.org/}{$19^{th}$
                   European Conference on Parallel and Distributed Computing
                   (Euro-Par) 2013 Workshops}:
                   \href{http://xcr.cenit.latech.edu/resilience2013}{$6^{th}$
                   Workshop on Resiliency in High Performance Computing
                   (Resilience) in Clusters, Clouds, and Grids}",
  pages         = "727--736",
  month         = aug # "~26, ",
  year          = "2013",
  address       = "Aachen, Germany",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "978-3-642-54419-4",
  issn          = "0302-9743",
  doi           = "http://dx.doi.org/10.1007/978-3-642-54420-0_71",
  url           = "http://www.christian-engelmann.info/publications/naughton13using.pdf",
  url2          = "http://www.christian-engelmann.info/publications/naughton13using.ppt.pdf",
  abstract      = "The high performance computing~(HPC) community is working to
                   address fault tolerance and resilience concerns for current
                   and future large scale computing platforms. This is driving
                   enhancements in the programming environments, specifically
                   research on enhancing message passing libraries to support
                   fault tolerant computing capabilities. The community has
                   also recognized that tools for resilience experimentation
                   are greatly lacking. However, we argue that there are
                   several parallels between ``performance tools'' and
                   ``resilience tools''. As such, we believe the rich set of
                   HPC performance-focused tools can be extended (repurposed)
                   to benefit the resilience community. In this paper, we
                   describe the initial motivation to leverage standard HPC
                   performance analysis techniques to aid in developing
                   diagnostic tools to assist fault tolerance experiments for
                   HPC applications. These diagnosis procedures help to provide
                   context about the state of the system when errors (failures)
                   occur. We describe our initial work in leveraging an MPI
                   performance trace tool to assist in providing global context
                   during fault injection experiments. Such tools will assist
                   the HPC resilience community as it extends existing and new
                   application codes to support fault tolerance."
}
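
MPI trace tools of the kind repurposed in this paper typically rely on the
standard MPI profiling interface (PMPI): each MPI call is intercepted, logged
with its context, and forwarded to the real implementation. A sketch of one
such wrapper (the log format is our own):

/* PMPI interposition sketch: link this object ahead of the MPI library
 * (or LD_PRELOAD it) to record every MPI_Send. */
#include <mpi.h>
#include <stdio.h>

int MPI_Send(const void *buf, int count, MPI_Datatype type, int dest,
             int tag, MPI_Comm comm)
{
    int rank;
    PMPI_Comm_rank(comm, &rank);            /* query without re-entering */
    fprintf(stderr, "[rank %d] MPI_Send -> %d, count=%d, tag=%d\n",
            rank, dest, count, tag);
    return PMPI_Send(buf, count, type, dest, tag, comm); /* real call */
}
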
@conference{jones11simulation,
  author        = "Ian S. Jones
                   and Christian Engelmann",
  title         = "Simulation of Large-Scale {HPC} Architectures",
  booktitle     = "Proceedings of the
                   \href{http://icpp2011.org}{$40^{th}$ International Conference
                   on Parallel Processing (ICPP) 2011}:
                   \href{http://www.psti-workshop.org} {$2^{nd}$ International
                   Workshop on Parallel Software Tools and Tool Infrastructures
                   (PSTI)}",
  pages         = "447-456",
  month         = sep # "~13-19, ",
  year          = "2011",
  address       = "Taipei, Taiwan",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-4511-0",
  issn          = "1530-2016",
  doi           = "http://dx.doi.org/10.1109/ICPPW.2011.44",
  url           = "http://www.christian-engelmann.info/publications/jones11simulation.pdf",
  url2          = "http://www.christian-engelmann.info/publications/jones11simulation.ppt.pdf",
  abstract      = "The Extreme-scale Simulator (xSim) is a recently developed
                   performance investigation toolkit that permits running
                   high-performance computing (HPC) applications in a controlled
                   environment with millions of concurrent execution threads. It
                   allows observing parallel application performance properties
                   in a simulated extreme-scale HPC system to further assist in
                   HPC hardware and application software co-design on the road
                   toward multi-petascale and exascale computing. This paper
                   presents a newly implemented network model for the xSim
                   performance investigation toolkit that is capable of
                   providing simulation support for a variety of HPC network
                   architectures with the appropriate trade-off between
                   simulation scalability and accuracy. The approach taken
                   focuses on a scalable, distributed solution with latency and
                   bandwidth restrictions for the simulated network. Different
                   network architectures, such as star, ring, mesh, torus,
                   twisted torus, and tree, as well as hierarchical
                   combinations, such as for simulating network-on-chip and
                   network-on-node topologies, are supported. Network traffic
                   congestion modeling is omitted to
                   gain simulation scalability by reducing simulation accuracy."
}
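
With congestion modeling omitted, the cost of a simulated message reduces to
a latency term plus a size-over-bandwidth term accumulated along the route. A
minimal rendering of that cost function; the parameter names are ours, not
xSim's:

/* Per-hop cost = latency + bytes / bandwidth; route cost = sum over hops. */
#include <stdio.h>

double message_time(double latency_s, double bandwidth_Bps,
                    double bytes, int hops)
{
    return hops * (latency_s + bytes / bandwidth_Bps);
}

int main(void)
{
    /* e.g., 1 MB over 3 torus hops with 1 us latency and 10 GB/s links */
    printf("%.9f s\n", message_time(1e-6, 1e10, 1 << 20, 3));
    return 0;
}
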
@conference{fiala11tunable,
  author        = "David Fiala
                   and Kurt Ferreira
                   and Frank Mueller
                   and Christian Engelmann",
  title         = "A Tunable, Software-based {DRAM} Error Detection and
                   Correction Library for {HPC}",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://europar2011.bordeaux.inria.fr/}{$17^{th}$
                   European Conference on Parallel and Distributed Computing
                   (Euro-Par) 2011 Workshops, Part II}:
                   \href{http://xcr.cenit.latech.edu/resilience2011}{$4^{th}$
                   Workshop on Resiliency in High Performance Computing
                   (Resilience) in Clusters, Clouds, and Grids}",
  volume        = "7156",
  pages         = "251-261",
  month         = aug # "~29 - " # sep # "~2, ",
  year          = "2011",
  address       = "Bordeaux, France",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "978-3-642-29740-3",
  doi           = "http://dx.doi.org/10.1007/978-3-642-29740-3_29",
  url           = "http://www.christian-engelmann.info/publications/fiala11tunable.pdf",
  url2          = "",
  abstract      = "Proposed exascale systems will present a number of
                   considerable resiliency challenges. In particular, DRAM
                   soft-errors, or bit-flips, are expected to greatly increase
                   due to the increased memory density of these systems.
                   Current hardware-based fault-tolerance methods will be
                   unsuitable for addressing the expected soft error rate. As a
                   result, additional software will be needed to
                   address this challenge. In this paper we introduce LIBSDC,
                   a tunable, transparent silent data corruption detection and
                   correction library for HPC applications. LIBSDC provides
                   comprehensive SDC protection for program memory by
                   implementing on-demand page integrity verification.
                   Experimental benchmarks with Mantevo HPCCG show that once
                   tuned, LIBSDC is able to achieve SDC protection with a 50\%
                   resource overhead, less than the 100\% needed for double
                   modular redundancy.",
  note          = "Acceptance rate 60.0\% (12/20)"
}
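
On-demand page integrity verification can be pictured as checksum
bookkeeping: record a checksum while a page is known to be clean and
re-verify it before use. The sketch below shows only that bookkeeping, with a
deliberately weak checksum; LIBSDC itself intercepts accesses (e.g., through
page protection) and would use a stronger code:

/* Page checksum bookkeeping sketch; not the LIBSDC implementation. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE 4096

/* simple additive checksum, for illustration only */
static uint32_t page_sum(const uint8_t *p)
{
    uint32_t s = 0;
    for (int i = 0; i < PAGE; i++) s += p[i];
    return s;
}

int main(void)
{
    static uint8_t page[PAGE];
    memset(page, 0xAB, PAGE);
    uint32_t expected = page_sum(page);   /* recorded while page is clean */

    page[123] ^= 0x01;                    /* simulate a DRAM bit-flip */

    if (page_sum(page) != expected)       /* on-demand verification */
        puts("silent data corruption detected");
    return 0;
}
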
@conference{naughton11case,
  author        = "Thomas Naughton
                   and Geoffroy R. Vall\'ee
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "A Case for Virtual Machine based Fault Injection in a
                   High-Performance Computing Environment",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://europar2011.bordeaux.inria.fr/}{$17^{th}$
                   European Conference on Parallel and Distributed Computing
                   (Euro-Par) 2011}:
                   \href{http://www.csm.ornl.gov/srt/conferences/hpcvirt2011}
                   {$5^{th}$ Workshop on System-level Virtualization for High
                   Performance Computing (HPCVirt)}",
  volume        = "7155",
  pages         = "234-243",
  month         = aug # "~29 - " # sep # "~2, ",
  year          = "2011",
  address       = "Bordeaux, France",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "978-3-642-29737",
  doi           = "http://dx.doi.org/10.1007/978-3-642-29737-3_27",
  url           = "http://www.christian-engelmann.info/publications/naughton11case.pdf",
  url2          = "http://www.christian-engelmann.info/publications/naughton11case.ppt.pdf",
  abstract      = "Large-scale computing platforms provide tremendous
                   capabilities for scientific discovery. These systems have
                   hundreds of thousands of computing cores, hundreds of
                   terabytes of memory, and enormous high-performance
                   interconnection networks. These systems are facing enormous
                   challenges to achieve performance at such scale. Failures
                   are an Achilles heel of these enormous systems. As
                   applications and system software scale up to multi-petaflop
                   and beyond to exascale platforms, the occurrence of failure
                   will be much more common. This has given rise to a push in
                   fault-tolerance and resilience research for HPC systems.
                   This includes work on log analysis to identify types of
                   failures, enhancements to the Message Passing Interface
                   (MPI) to incorporate fault awareness, and a variety of
                   fault tolerance mechanisms that span redundant computation,
                   algorithm-based fault tolerance, and advanced
                   checkpoint/restart techniques. While there is much work to be
                   done on
                   the FT/Resilience mechanisms for such large-scale systems,
                   there is also a profound gap in the tools for
                   experimentation. This gap is compounded by the fact that HPC
                   environments have stringent performance requirements and are
                   often highly customized. The tool chain for these systems is
                   often tailored for the platform, and while the majority of
                   systems on the Top500 Supercomputer list run Linux, these
                   operating environments typically contain many
                   site/machine-specific enhancements. Therefore, it is
                   desirable to maintain a consistent execution environment to
                   minimize end-user (scientist) interruption. The work on
                   system-level virtualization for HPC systems offers a unique
                   opportunity to maintain a consistent execution environment
                   via a virtual machine (VM). Recent work on virtualization for
                   HPC has shown that low-overhead, high-performance systems can
                   be realized [1, 2]. Virtualization also provides a clean
                   abstraction for
                   building experimental tools for investigation into the
                   effects of failures in HPC and the related research on FT/
                   Resilience mechanisms and policies. In this paper we discuss
                   the motivation for tools to perform fault injection in an HPC
                   context, and outline an approach that can leverage
                   virtualization."
}
@conference{engelmann10facilitating,
  author        = "Christian Engelmann
                   and Frank Lauer",
  title         = "Facilitating Co-Design for Extreme-Scale Systems Through
                   Lightweight Simulation",
  booktitle     = "Proceedings of the
                   \href{http://www.cluster2010.org}{$12^{th}$ IEEE
                   International Conference on Cluster Computing (Cluster)
                   2010}: \href{http://www2.wmin.ac.uk/getovv/aacec10.html}
                   {$1^{st}$ Workshop on Application/Architecture Co-design for
                   Extreme-scale Computing (AACEC)}",
  pages         = "1-8",
  month         = sep # "~20-24, ",
  year          = "2010",
  address       = "Hersonissos, Crete, Greece",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-1-4244-8395-2",
  doi           = "http://dx.doi.org/10.1109/CLUSTERWKSP.2010.5613113",
  url           = "http://www.christian-engelmann.info/publications/engelmann10facilitating.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann10facilitating.ppt.pdf",
  abstract      = "This work focuses on tools for investigating algorithm
                   performance at extreme scale with millions of concurrent
                   threads and for evaluating the impact of future architecture
                   choices to facilitate the co-design of high-performance
                   computing (HPC) architectures and applications. The approach
                   focuses on lightweight simulation of extreme-scale HPC
                   systems with the needed amount of accuracy. The prototype
                   presented in this paper is able to provide this capability
                   using a parallel discrete event simulation (PDES), such that
                   a Message Passing Interface (MPI) application can be executed
                   at extreme scale, and its performance properties can be
                   evaluated. The results of an initial prototype are
                   encouraging as a simple hello world MPI program could be
                   scaled up to 1,048,576 virtual MPI processes on a four-node
                   cluster, and the performance properties of two MPI programs
                   could be evaluated at up to 1,024 and 16,384 virtual MPI
                   processes on the same system."
}
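
At the heart of such a simulator sits a discrete event loop that always
advances virtual time to the event with the smallest timestamp. A minimal
sequential sketch of that engine; the actual PDES distributes the event queue
across processes and adds MPI semantics on top:

/* Minimal discrete event engine; sequential stand-in for a PDES core. */
#include <stdio.h>

#define MAXEV 64
typedef struct { double time; int kind; } event;

static event q[MAXEV];
static int n;

static void push(double t, int k) { q[n].time = t; q[n].kind = k; n++; }

static event pop(void)              /* extract the minimum-timestamp event */
{
    int m = 0;
    for (int i = 1; i < n; i++) if (q[i].time < q[m].time) m = i;
    event e = q[m]; q[m] = q[--n];
    return e;
}

int main(void)
{
    push(0.0, 0);
    while (n > 0) {
        event e = pop();
        double now = e.time;        /* virtual time only moves at events */
        printf("t=%.3f kind=%d\n", now, e.kind);
        if (e.kind == 0 && now < 1.0)
            push(now + 0.25, 0);    /* handler schedules a future event */
    }
    return 0;
}
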
@conference{ostrouchov09nonparametric,
  author        = "George Ostrouchov
                   and Thomas Naughton
                   and Christian Engelmann
                   and Geoffroy R. Vall\'ee
                   and Stephen L. Scott",
  title         = "Nonparametric Multivariate Anomaly Analysis in Support of
                   {HPC} Resilience",
  booktitle     = "Proceedings of the \href{http://www.oerc.ox.ac.uk/ieee}
                   {$5^{th}$ IEEE International Conference on e-Science
                   (e-Science) 2009}:
                   \href{http://www.oerc.ox.ac.uk/ieee/workshops/workshops/computational-science}
                   {Workshop on Computational Science}",
  pages         = "80-85",
  month         = dec # "~9-11, ",
  year          = "2009",
  address       = "Oxford, UK",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-1-4244-5946-9",
  doi           = "http://dx.doi.org/10.1109/ESCIW.2009.5407992",
  url           = "http://www.christian-engelmann.info/publications/ostrouchov09nonparametric.pdf",
  url2          = "http://www.christian-engelmann.info/publications/ostrouchov09nonparametric.ppt.pdf",
  abstract      = "Large-scale computing systems provide great potential for
                   scientific exploration. However, the complexity that
                   accompanies these enormous machines raises challenges for
                   both users and operators. The effective use of such systems
                   is often hampered by failures encountered when running
                   applications on systems containing tens of thousands of
                   nodes and hundreds of thousands of compute cores capable of
                   yielding petaflops of performance. In systems of this size,
                   failure detection is complicated and root-cause diagnosis
                   difficult. This paper describes our recent work in the
                   identification of anomalies in monitoring data and system
                   logs to provide further insights into machine status, runtime
                   behavior, failure modes and failure root causes. It discusses
                   the details of an initial prototype that gathers the data and
                   uses statistical techniques for analysis."
}
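
A common nonparametric building block for this kind of anomaly analysis is
median/MAD outlier scoring, shown below for a single variable. The paper's
analysis is multivariate and considerably more sophisticated, so treat this
only as the flavor of the approach:

/* Robust (median/MAD) outlier scoring on one monitored variable. */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

static int cmp(const void *a, const void *b)
{
    double d = *(const double *)a - *(const double *)b;
    return (d > 0) - (d < 0);
}

static double median(double *x, int n)    /* sorts x as a side effect */
{
    qsort(x, n, sizeof(double), cmp);
    return n % 2 ? x[n / 2] : 0.5 * (x[n / 2 - 1] + x[n / 2]);
}

int main(void)
{
    double v[] = { 51, 50, 49, 52, 48, 50, 95 };  /* e.g., node temperatures */
    int n = sizeof(v) / sizeof(v[0]);
    double tmp[7], dev[7];
    for (int i = 0; i < n; i++) tmp[i] = v[i];
    double med = median(tmp, n);
    for (int i = 0; i < n; i++) dev[i] = fabs(v[i] - med);
    double mad = median(dev, n);
    for (int i = 0; i < n; i++)   /* flag robust z-scores above 3.5 */
        if (0.6745 * fabs(v[i] - med) / mad > 3.5)
            printf("anomaly: sample %d = %g\n", i, v[i]);
    return 0;
}
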
@conference{naughton09fault,
  author        = "Thomas Naughton
                   and Wesley Bland
                   and Geoffroy R. Vall\'ee
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Fault Injection Framework for System Resilience Evaluation --
                   {F}ake Faults for Finding Future Failures",
  booktitle     = "Proceedings of the
                   \href{http://www.lrz-muenchen.de/hpdc2009}{$18^{th}$
                   International Symposium on High Performance Distributed
                   Computing (HPDC) 2009}:
                   \href{http://xcr.cenit.latech.edu/resilience2009}{$2^{nd}$
                   Workshop on Resiliency in High Performance Computing
                   (Resilience) 2009}",
  pages         = "23--28",
  month         = jun # "~9, ",
  year          = "2009",
  address       = "Munich, Germany",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  isbn          = "978-1-60558-587-1",
  doi           = "http://doi.acm.org/10.1145/1552526.1552530",
  url           = "http://www.christian-engelmann.info/publications/naughton09fault.pdf",
  url2          = "http://www.christian-engelmann.info/publications/naughton09fault.ppt.pdf",
  abstract      = "As high-performance computing (HPC) systems increase in size
                   and complexity, they become more difficult to manage. The
                   enormous component counts associated with these large systems
                   lead to significant challenges in system reliability and
                   availability. This in turn is driving research into the
                   resilience of large-scale systems, which seeks to curb the
                   effects of increased failures at large scales by masking the
                   inevitable faults in these systems. The basic premise is
                   that failure must be accepted as a reality of large-scale
                   systems and coped with accordingly through system resilience.
                   A key component in the development and evaluation of system
                   resilience techniques is having a means to conduct controlled
                   experiments. A common method for performing such experiments
                   is to generate synthetic faults and study the resulting
                   effects. In this paper we discuss the motivation and our
                   initial use of software fault injection to support the
                   evaluation of resilience for HPC systems. We mention
                   background and related work in the area and discuss the
                   design of a tool to aid in fault injection experiments for
                   both user-space (application-level) and system-level
                   failures."
}
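
The simplest synthetic fault in this setting is a fail-stop process failure:
choose a victim rank, kill it mid-run, and observe how the application and
runtime react. A deliberately small injector along those lines; it is an
illustration of the idea, not the paper's framework:

/* Fail-stop fault injector sketch: kill one randomly chosen MPI rank. */
#include <mpi.h>
#include <signal.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, size, victim = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (rank == 0) victim = rand() % size;   /* pick the victim once */
    MPI_Bcast(&victim, 1, MPI_INT, 0, MPI_COMM_WORLD);

    /* ... application work would run here ... */

    if (rank == victim) raise(SIGKILL);      /* synthetic fail-stop fault */

    /* Survivors now observe the loss: with a conventional MPI this
     * barrier hangs or aborts, which is exactly the behavior under study. */
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
    return 0;
}
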
@conference{tikotekar09performance,
  author        = "Anand Tikotekar
                   and Hong H. Ong
                   and Sadaf Alam
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Performance Comparison of Two Virtual Machine Scenarios Using
                   an {HPC} Application -- {A} Case Study Using Molecular
                   Dynamics Simulations",
  booktitle     = "Proceedings of the
                   \href{http://www.csm.ornl.gov/srt/hpcvirt09}{$3^{rd}$
                   Workshop on System-level Virtualization for High Performance
                   Computing (HPCVirt) 2009}, in conjunction with the
                   \href{http://www.eurosys.org/2009}{$4^{th}$ ACM SIGOPS
                   European Conference on Computer Systems (EuroSys) 2009}",
  pages         = "33--40",
  month         = mar # "~30, ",
  year          = "2009",
  address       = "Nuremberg, Germany",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  isbn          = "978-1-60558-465-2",
  doi           = "http://doi.acm.org/10.1145/1519138.1519143",
  url           = "http://www.christian-engelmann.info/publications/tikotekar09performance.pdf",
  url2          = "http://www.christian-engelmann.info/publications/tikotekar09performance.ppt.pdf",
  abstract      = "Obtaining high flexibility to performance-loss ratio is a
                   key challenge of today's HPC virtual environment landscape.
                   And while extensive research has been targeted at extracting
                   more performance from virtual machines, the idea that whether
                   novel virtual machine usage scenarios could lead to high
                   flexibility Vs performance trade-off has received less
                   attention. We, in this paper, take a step forward by studying
                   and comparing the performance implications of running the
                   Large-scale Atomic/Molecular Massively Parallel Simulator
                   (LAMMPS) application on two virtual machine configurations.
                   First configuration consists of two virtual machines per node
                   with 1 application process per virtual machine. The second
                   configuration consists of 1 virtual machine per node with 2
                   processes per virtual machine. Xen has been used as an
                   hypervisor and standard Linux as a guest virtual machine. Our
                   results show that the difference in overall performance
                   impact on LAMMPS between the two virtual machine
                   configurations described above is around 3\%. We also study
                   the difference in performance impact in terms of each
                   configuration's individual metrics such as CPU, I/O, Memory,
                   and interrupt/context switches."
}
@conference{vallee08virtual,
  author        = "Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Hong H. Ong
                   and Anand Tikotekar
                   and Christian Engelmann
                   and Wesley Bland
                   and Ferrol Aderholt
                   and Stephen L. Scott",
  title         = "Virtual System Environments",
  booktitle     = "Communications in Computer and Information Science:
                   Proceedings of the \href{http://www.dmtf.org/svm08}{$2^{nd}$
                   DMTF Academic Alliance Workshop on Systems and Virtualization
                   Management: Standards and New Technologies (SVM) 2008}",
  volume        = "18",
  pages         = "72--83",
  month         = oct # "~21-22, ",
  year          = "2008",
  address       = "Munich, Germany",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "978-3-540-88707-2",
  issn          = "1865-0929",
  doi           = "http://dx.doi.org/10.1007/978-3-540-88708-9_7",
  url           = "http://www.christian-engelmann.info/publications/vallee08virtual.pdf",
  url2          = "",
  abstract      = "Distributed and parallel systems are typically managed with
                   static settings: the operating system (OS) and the runtime
                   environment (RTE) are specified at a given time and cannot be
                   changed to fit an application's needs. This means that every
                   time application developers want to use their application on
                   a new execution platform, the application has to be ported to
                   this new environment, which may be expensive in terms of
                   application modifications and developer time. However, the
                   science resides in the applications and not in the OS or the
                   RTE. Therefore, it should be beneficial to adapt the OS and
                   the RTE to the application instead of adapting the
                   applications to the OS and the RTE. This document presents
                   the concept of Virtual System Environments (VSE), which
                   enables application developers to specify and create a
                   virtual environment that properly fits their application's
                   needs. For that, four challenges have to be addressed: (i)
                   definition of the VSE itself by the application developers,
                   (ii) deployment of the VSE, (iii) system administration for
                   the platform, and (iv) protection of the platform from the
                   running VSE. We therefore present an integrated tool for the
                   definition and deployment of VSEs on top of traditional and
                   virtual (i.e., using system-level virtualization) execution
                   platforms. This tool provides the capability to choose the
                   degree of delegation for system administration tasks and the
                   degree of protection from the application (e.g., using
                   virtual machines). To summarize, the VSE concept enables the
                   customization of the OS/RTE used for the execution of
                   applications by users without compromising local system
                   administration rules and execution platform protection
                   constraints."
}
@conference{tikotekar08analysis,
  author        = "Anand Tikotekar
                   and Geoffroy Vall\'ee
                   and Thomas Naughton
                   and Hong H. Ong
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "An Analysis of {HPC} Benchmark Applications in Virtual
                   Machine Environments",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://europar2008.caos.uab.es}{$14^{th}$ European
                   Conference on Parallel and Distributed Computing (Euro-Par)
                   2008}: \href{http://scilytics.com/vhpc}{$3^{rd}$ Workshop on
                   Virtualization in High-Performance Cluster and Grid Computing
                   (VHPC) 2008}",
  volume        = "5415",
  pages         = "63--71",
  month         = aug # "~26-29, ",
  year          = "2008",
  address       = "Las Palmas de Gran Canaria, Spain",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "978-3-642-00954-9",
  doi           = "http://dx.doi.org/10.1007/978-3-642-00955-6",
  url           = "http://www.christian-engelmann.info/publications/tikotekar08analysis.pdf",
  url2          = "http://www.christian-engelmann.info/publications/tikotekar08analysis.ppt.pdf",
  abstract      = "Virtualization technology has been gaining acceptance in the
                   scientific community due to its overall flexibility in
                   running HPC applications. It has been reported that a
                   specific class of applications is better suited to a
                   particular type of virtualization scheme or implementation.
                   For example, Xen has been shown to perform with little
                   overhead for compute-bound applications. Such a study,
                   although useful, does not allow us to generalize conclusions
                   beyond the performance analysis of the application that was
                   explicitly executed. One explanation of why such
                   generalization is difficult may be the diversity of
                   applications, which leads to different overheads in virtual
                   environments. For example, two similar applications may spend
                   disproportionate amounts of time in their respective library
                   code when run in virtual environments. In this paper,
                   we aim to study such potential causes by investigating the
                   behavior and identifying patterns of various overheads for
                   HPC benchmark applications. Based on the investigation of the
                   overhead profiles for different benchmarks, we aim to address
                   questions such as: Are the overhead profiles for a particular
                   type of benchmark (such as compute-bound) similar, or are
                   there grounds to conclude otherwise?"
}
@conference{engelmann08symmetric2,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Chokchai (Box) Leangsuksun
                   and Xubin (Ben) He",
  title         = "Symmetric Active/Active High Availability for
                   High-Performance Computing System Services: Accomplishments
                   and Limitations",
  booktitle     = "Proceedings of the
                   \href{http://www.ens-lyon.fr/LIP/RESO/ccgrid2008}{$8^{th}$
                   IEEE International Symposium on Cluster Computing and the
                   Grid (CCGrid) 2008}:
                   \href{http://xcr.cenit.latech.edu/resilience2008}{Workshop on
                   Resiliency in High Performance Computing (Resilience) 2008}",
  pages         = "813--818",
  month         = may # "~19-22, ",
  year          = "2008",
  address       = "Lyon, France",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-3156-4",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/CCGRID.2008.78",
  url           = "http://www.christian-engelmann.info/publications/engelmann08symmetric2.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann08symmetric2.pdf",
  abstract      = "This paper summarizes our efforts over the last 3-4 years in
                   providing symmetric active/active high availability for
                   high-performance computing (HPC) system services. This work
                   paves the way for high-level reliability, availability and
                   serviceability in extreme-scale HPC systems by focusing on
                   the most critical components, head and service nodes, and by
                   reinforcing them with appropriate high availability
                   solutions. This paper presents our accomplishments in the
                   form of concepts and respective prototypes, discusses
                   existing limitations, outlines possible future work, and
                   describes the relevance of this research to other, planned
                   efforts."
}
@conference{chen08online,
  author        = "Xin Chen
                   and Benjamin Eckart
                   and Xubin (Ben) He
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "An Online Controller Towards Self-Adaptive File System
                   Availability and Performance",
  booktitle     = "Proceedings of the
                   \href{http://xcr.cenit.latech.edu/hapcw2008}{$5^{th}$ High
                   Availability and Performance Workshop (HAPCW) 2008}, in
                   conjunction with the \href{http://www.hpcsw.org}{$1^{st}$
                   High-Performance Computer Science Week (HPCSW) 2008}",
  month         = apr # "~3-4, ",
  year          = "2008",
  address       = "Denver, CO, USA",
  url           = "http://www.christian-engelmann.info/publications/chen08online.pdf",
  url2          = "http://www.christian-engelmann.info/publications/chen08online.ppt.pdf",
  abstract      = "At the present time, it can be a significant challenge to
                   build a large-scale distributed file system that
                   simultaneously maintains both high availability and high
                   performance. Although many fault tolerance technologies have
                   been proposed and used in both commercial and academic
                   distributed file systems to achieve high availability, most
                   of them typically sacrifice performance for higher system
                   availability. Additionally, recent studies show that system
                   availability and performance are related to the system
                   workload. In this paper, we analyze the correlations among
                   availability, performance, and workloads based on a
                   replication strategy, and we discuss the trade-off between
                   availability and performance with different workloads. Our
                   analysis leads to the design of an online controller that can
                   dynamically achieve optimal performance and availability by
                   tuning the system replication policy."
}
@conference{tikotekar08effects,
  author        = "Anand Tikotekar
                   and Geoffroy Vall\'ee
                   and Thomas Naughton
                   and Hong H. Ong
                   and Christian Engelmann
                   and Stephen L. Scott
                   and Anthony M. Filippi",
  title         = "Effects of Virtualization on a Scientific Application --
                   {R}unning a Hyperspectral Radiative Transfer Code on Virtual
                   Machines",
  booktitle     = "Proceedings of the
                   \href{http://www.csm.ornl.gov/srt/hpcvirt08}{$2^{nd}$
                   Workshop on System-level Virtualization for High Performance
                   Computing (HPCVirt) 2008}, in conjunction with the
                   \href{http://www.eurosys.org/2008}{$3^{rd}$ ACM SIGOPS
                   European Conference on Computer Systems (EuroSys) 2008}",
  pages         = "16--23",
  month         = mar # "~31, ",
  year          = "2008",
  address       = "Glasgow, UK",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  isbn          = "978-1-60558-120-0",
  doi           = "http://doi.acm.org/10.1145/1435452.1435455",
  url           = "http://www.christian-engelmann.info/publications/tikotekar08effects.pdf",
  url2          = "http://www.christian-engelmann.info/publications/tikotekar08effects.ppt.pdf",
  abstract      = "The topic of system-level virtualization has recently begun
                   to receive interest for high performance computing (HPC).
                   This is in part due to the isolation and encapsulation
                   offered by the virtual machine. These traits enable
                   applications to customize their environments and maintain
                   consistent software configurations in their virtual domains.
                   Additionally, there are mechanisms that can be used for fault
                   tolerance like live virtual machine migration. Given these
                   attractive benefits to virtualization, a fundamental question
                   arises: how does this affect my scientific application? We
                   use this as the premise for our paper and observe a
                   real-world scientific code running on a Xen virtual machine.
                   We studied the effects of running a radiative transfer
                   simulation, Hydrolight, on a virtual machine. We discuss our
                   methodology and report observations regarding the usage of
                   virtualization with this application."
}
@conference{engelmann07middleware,
  author        = "Christian Engelmann
                   and Hong H. Ong
                   and Stephen L. Scott",
  title         = "Middleware in Modern High Performance Computing System
                   Architectures",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://www.iccs-meeting.org/iccs2007}{$7^{th}$
                   International Conference on Computational Science (ICCS)
                   2007}, Part II: \href{http://www.gup.uni-linz.ac.at/cce2007}
                   {$4^{th}$ Special Session on Collaborative and Cooperative
                   Environments (CCE) 2007}",
  volume        = "4488",
  pages         = "784--791",
  month         = may # "~27-30, ",
  year          = "2007",
  address       = "Beijing, China",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "3-5407-2585-5",
  issn          = "0302-9743",
  doi           = "http://dx.doi.org/10.1007/978-3-540-72586-2_111",
  url           = "http://www.christian-engelmann.info/publications/engelmann07middleware.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann07middleware.ppt.pdf",
  abstract      = "A recent trend in modern high performance computing (HPC)
                   system architectures employs lean compute nodes running a
                   lightweight operating system (OS). Certain parts of the OS,
                   as well as other system software services, are moved to service
                   nodes in order to increase performance and scalability. This
                   paper examines the impact of this HPC system architecture
                   trend on HPC middleware software solutions, which
                   traditionally equip HPC systems with advanced features, such
                   as parallel and distributed programming models, appropriate
                   system resource management mechanisms, remote application
                   steering and user interaction techniques. Since the approach
                   of keeping the compute node software stack small and simple
                   is orthogonal to the middleware concept of adding missing OS
                   features between OS and application, the role and
                   architecture of middleware in modern HPC systems need to be
                   revisited. The result is a paradigm shift in HPC middleware
                   design, where single middleware services are moved to service
                   nodes, while runtime environments (RTEs) continue to reside
                   on compute nodes."
}
@conference{engelmann07transparent,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Chokchai (Box) Leangsuksun
                   and Xubin (Ben) He",
  title         = "Transparent Symmetric Active/Active Replication for
                   Service-Level High Availability",
  booktitle     = "Proceedings of the \href{http://ccgrid07.lncc.br}{$7^{th}$
                   IEEE International Symposium on Cluster Computing and the
                   Grid (CCGrid) 2007}: \href{http://www.lri.fr/~fedak/gp2pc-07}
                   {$7^{th}$ International Workshop on Global and Peer-to-Peer
                   Computing (GP2PC) 2007}",
  pages         = "755--760",
  month         = may # "~14-17, ",
  year          = "2007",
  address       = "Rio de Janeiro, Brazil",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "0-7695-2833-3",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/CCGRID.2007.116",
  url           = "http://www.christian-engelmann.info/publications/engelmann07transparent.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann07transparent.ppt.pdf",
  abstract      = "As service-oriented architectures become more important in
                   parallel and distributed computing systems, individual
                   service instance reliability as well as appropriate service
                   redundancy becomes essential in order to
                   increase overall system availability. This paper focuses on
                   providing redundancy strategies using service-level
                   replication techniques. Based on previous research using
                   symmetric active/active replication, this paper proposes a
                   transparent symmetric active/active replication approach that
                   allows for more reuse of code between individual
                   service-level replication implementations by using a virtual
                   communication layer. Service- and client-side interceptors
                   are utilized in order to provide total transparency. Clients
                   and servers are unaware of the replication infrastructure as
                   it provides all necessary mechanisms internally."
}
@conference{engelmann07configurable,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Hong H. Ong
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton",
  title         = "Configurable Virtualized System Environments for High
                   Performance Computing",
  booktitle     = "Proceedings of the
                   \href{http://www.csm.ornl.gov/srt/hpcvirt07}{$1^{st}$
                   Workshop on System-level Virtualization for High Performance
                   Computing (HPCVirt) 2007}, in conjunction with the
                   \href{http://www.eurosys.org/2008}{$2^{nd}$ ACM SIGOPS
                   European Conference on Computer Systems (EuroSys) 2007}",
  month         = mar # "~20, ",
  year          = "2007",
  address       = "Lisbon, Portugal",
  url           = "http://www.christian-engelmann.info/publications/engelmann07configurable.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann07configurable.ppt.pdf",
  abstract      = "Existing challenges for current terascale high performance
                   computing (HPC) systems are increasingly hampering the
                   development and deployment efforts of system software and
                   scientific applications for next-generation petascale
                   systems. The expected rapid system upgrade interval toward
                   petascale scientific computing demands an incremental
                   strategy for the development and deployment of legacy and new
                   large-scale scientific applications that avoids excessive
                   porting. Furthermore, system software developers as well as
                   scientific application developers require access to
                   large-scale testbed environments in order to test individual
                   solutions at scale. This paper proposes to address these
                   issues at the system software level through the development
                   of a virtualized system environment (VSE) for scientific
                   computing. The proposed VSE approach enables
                   plug-and-play supercomputing through
                   desktop-to-cluster-to-petaflop computer system-level
                   virtualization based on recent advances in hypervisor
                   virtualization technologies. This paper describes the VSE
                   system architecture in detail, discusses needed tools for
                   VSE system management and configuration, and presents
                   respective VSE use case scenarios."
}
@conference{engelmann06towards,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Chokchai (Box) Leangsuksun
                   and Xubin (Ben) He",
  title         = "Towards High Availability for High-Performance Computing
                   System Services: {A}ccomplishments and Limitations",
  booktitle     = "Proceedings of the
                   \href{http://xcr.cenit.latech.edu/hapcw2006}{$4^{th}$ High
                   Availability and Performance Workshop (HAPCW) 2006}, in
                   conjunction with the \href{http://lacsi.krellinst.org}
                   {$7^{th}$ Los Alamos Computer Science Institute (LACSI)
                   Symposium 2006}",
  month         = oct # "~17, ",
  year          = "2006",
  address       = "Santa Fe, NM, USA",
  url           = "http://www.christian-engelmann.info/publications/engelmann06towards.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann06towards.ppt.pdf",
  abstract      = "During the last several years, our teams at Oak Ridge
                   National Laboratory, Louisiana Tech University, and Tennessee
                   Technological University focused on efficient redundancy
                   strategies for head and service nodes of high-performance
                   computing (HPC) systems in order to pave the way for high
                   availability (HA) in HPC. These nodes typically run critical
                   HPC system services, like job and resource management, and
                   represent single points of failure and control for an entire
                   HPC system. The overarching goal of our research is to
                   provide high-level reliability, availability, and
                   serviceability (RAS) for HPC systems by combining HA and HPC
                   technology. This paper summarizes our accomplishments, such
                   as developed concepts and implemented proof-of-concept
                   prototypes, and describes existing limitations, such as
                   performance issues, which need to be dealt with for
                   production-type deployment."
}
@conference{ou06achieving,
  author        = "Li Ou
                   and Xin Chen
                   and Xubin (Ben) He
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Achieving Computational {I/O} Effciency in a High Performance
                   Cluster Using Multicore Processors",
  booktitle     = "Proceedings of the
                   \href{http://xcr.cenit.latech.edu/hapcw2006}{$4^{th}$ High
                   Availability and Performance Workshop (HAPCW) 2006}, in
                   conjunction with the \href{http://lacsi.krellinst.org}
                   {$7^{th}$ Los Alamos Computer Science Institute (LACSI)
                   Symposium 2006}",
  month         = oct # "~17, ",
  year          = "2006",
  address       = "Santa Fe, NM, USA",
  url           = "http://www.christian-engelmann.info/publications/ou06achieving.pdf",
  url2          = "http://www.christian-engelmann.info/publications/ou06achieving.ppt.pdf",
  abstract      = "Cluster computing has become one of the most popular
                   platforms for high-performance computing today. The recent
                   popularity of multicore processors provides a flexible way to
                   increase the computational capability of clusters. Although
                   the system performance may improve with multicore processors
                   in a cluster, I/O requests initiated by multiple cores may
                   saturate the I/O bus and further increase latency by
                   issuing multiple non-contiguous disk accesses. In this
                   paper, we propose an asymmetric collective I/O for multicore
                   processors to improve multiple non-contiguous accesses. In
                   our configuration, one core in each multicore processor is
                   designated as the coordinator, and others serve as computing
                   cores. The coordinator is responsible for aggregating I/O
                   operations from computing cores and submitting a contiguous
                   request. The coordinator allocates contiguous memory buffers
                   on behalf of other cores to avoid redundant data copies."
}
@conference{engelmann06rmix,
  author        = "Christian Engelmann
                   and George A. (Al) Geist",
  title         = "{RMIX}: {A} Dynamic, Heterogeneous, Reconfigurable
                   Communication Framework",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://www.iccs-meeting.org/iccs2006}{$6^{th}$
                   International Conference on Computational Science (ICCS)
                   2006}, Part II: \href{http://www.gup.uni-linz.ac.at/cce2006}
                   {$3^{rd}$ Special Session on Collaborative and Cooperative
                   Environments (CCE) 2006}",
  volume        = "3992",
  pages         = "573--580",
  month         = may # "~28-31, ",
  year          = "2006",
  address       = "Reading, UK",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "3-540-34381-4",
  issn          = "0302-9743",
  doi           = "http://dx.doi.org/10.1007/11758525_77",
  url           = "http://www.christian-engelmann.info/publications/engelmann06rmix.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann06rmix.ppt.pdf",
  abstract      = "RMIX is a dynamic, heterogeneous, reconfigurable
                   communication framework that allows software components to
                   communicate using various RMI/RPC protocols, such as ONC RPC,
                   Java RMI and SOAP, by facilitating dynamically loadable
                   provider plug-ins to supply different protocol stacks. With
                   this paper, we present a native (C-based), flexible,
                   adaptable, multi-protocol RMI/RPC communication framework
                   that complements the Java-based RMIX variant previously
                   developed by our partner team at Emory University. Our
                   approach offers the same multi-protocol RMI/RPC services
                   and advanced invocation semantics via a C-based interface
                   that does not require an object-oriented programming
                   language. This paper provides a detailed description of our
                   RMIX framework architecture and some of its features. It
                   describes the general use case of the RMIX framework and its
                   integration into the Harness metacomputing environment in the
                   form of a plug-in."
}
@conference{engelmann06active,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Chokchai (Box) Leangsuksun
                   and Xubin (Ben) He",
  title         = "Active/Active Replication for Highly Available {HPC} System
                   Services",
  booktitle     = "Proceedings of the
                   \href{http://www.ares-conference.eu/ares2006}{$1^{st}$
                   International Conference on Availability, Reliability and
                   Security (ARES) 2006}: $1^{st}$ International Workshop on
                   Frontiers in Availability, Reliability and Security (FARES)
                   2006",
  pages         = "639-645",
  month         = apr # "~20-22, ",
  year          = "2006",
  address       = "Vienna, Austria",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "0-7695-2567-9",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/ARES.2006.23",
  url           = "http://www.christian-engelmann.info/publications/engelmann06active.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann06active.ppt.pdf",
  abstract      = "Today`s high performance computing systems have several
                   reliability deficiencies resulting in availability and
                   serviceability issues. Head and service nodes represent a
                   single point of failure and control for an entire system as
                   they render it inaccessible and unmanageable in case of a
                   failure until repair, causing a significant downtime. This
                   paper introduces two distinct replication methods (internal
                   and external) for providing symmetric active/active high
                   availability for multiple head and service nodes running in
                   virtual synchrony. It presents a comparison of both methods
                   in terms of expected correctness, ease-of-use and performance
                   based on early results from ongoing work in providing
                   symmetric active/active high availability for two HPC system
                   services (TORQUE and PVFS metadata server). It continues with
                   a short description of a distributed mutual exclusion
                   algorithm and a brief statement regarding the handling of
                   Byzantine failures. This paper concludes with an overview of
                   past and ongoing work, and a short summary of the presented
                   research."
}
@conference{engelmann05concepts,
  author        = "Christian Engelmann
                   and Stephen L. Scott",
  title         = "Concepts for High Availability in Scientific High-End
                   Computing",
  booktitle     = "Proceedings of the
                   \href{http://xcr.cenit.latech.edu/hapcw2005}{$3^{rd}$ High
                   Availability and Performance Workshop (HAPCW) 2005}, in
                   conjunction with the
                   \href{http://lacsi.rice.edu/symposium/agenda_2005}{$6^{th}$
                   Los Alamos Computer Science Institute (LACSI) Symposium
                   2005}",
  month         = oct # "~11, ",
  year          = "2005",
  address       = "Santa Fe, NM, USA",
  url           = "http://www.christian-engelmann.info/publications/engelmann05concepts.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann05concepts.ppt.pdf",
  abstract      = "Scientific high-end computing (HEC) has become an important
                   tool for scientists world-wide to understand problems, such
                   as in nuclear fusion, human genomics and nanotechnology.
                   Every year, new HEC systems emerge on the market with better
                   performance and higher scale. With only very few exceptions,
                   the overall availability of recently installed systems has
                   been lower than that of their predecessors during the same
                   deployment phase. In contrast to the experienced loss of
                   availability, the demand for continuous availability has
                   risen dramatically due to the recent trend towards capability
                   computing. In this paper, we analyze the existing
                   deficiencies of current HEC systems and present several high
                   availability concepts to counter the experienced loss of
                   availability and to alleviate the expected impact on
                   next-generation systems. We explain the application of these
                   concepts to current and future HEC systems and list past and
                   ongoing related research. This paper closes with a short
                   summary of the presented work and a brief discussion of
                   future efforts."
}
@conference{engelmann05high,
  author        = "Christian Engelmann
                   and Stephen L. Scott",
  title         = "High Availability for Ultra-Scale High-End Scientific
                   Computing",
  booktitle     = "Proceedings of the \href{http://coset.irisa.fr}{$2^{nd}$
                   International Workshop on Operating Systems, Programming
                   Environments and Management Tools for High-Performance
                   Computing on Clusters (COSET-2) 2005}, in conjunction with
                   the \href{http://ics05.csail.mit.edu}{$19^{th}$ ACM
                   International Conference on Supercomputing (ICS) 2005}",
  month         = jun # "~19, ",
  year          = "2005",
  address       = "Cambridge, MA, USA",
  url           = "http://www.christian-engelmann.info/publications/engelmann05high.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann05high.ppt.pdf",
  abstract      = "Ultra-scale architectures for scientific high-end computing
                   with tens to hundreds of thousands of processors, such as the
                   IBM Blue Gene/L and the Cray X1, suffer from availability
                   deficiencies, which impact the efficiency of running
                   computational jobs by forcing frequent checkpointing of
                   applications. Most systems are unable to handle runtime
                   system configuration changes caused by failures and require
                   a complete restart of essential system services, such as the
                   job scheduler or MPI, or even of the entire machine. In this
                   paper, we present a flexible, pluggable and component-based
                   high availability framework that expands today's effort in
                   high availability computing of keeping a single server alive
                   to include all machines cooperating in a high-end scientific
                   computing environment, while allowing adaptation to system
                   properties and application needs."
}
@conference{leangsuksun05asymmetric,
  author        = "Chokchai (Box) Leangsuksun
                   and Venkata K. Munganuru
                   and Tong Liu
                   and Stephen L. Scott
                   and Christian Engelmann",
  title         = "Asymmetric Active-Active High Availability for High-end
                   Computing",
  booktitle     = "Proceedings of the \href{http://coset.irisa.fr}{$2^{nd}$
                   International Workshop on Operating Systems, Programming
                   Environments and Management Tools for High-Performance
                   Computing on Clusters (COSET-2) 2005}, in conjunction with
                   the \href{http://ics05.csail.mit.edu}{$19^{th}$ ACM
                   International Conference on Supercomputing (ICS) 2005}",
  month         = jun # "~19, ",
  year          = "2005",
  address       = "Cambridge, MA, USA",
  url           = "http://www.christian-engelmann.info/publications/leangsuksun05asymmetric.pdf",
  url2          = "http://www.christian-engelmann.info/publications/leangsuksun05asymmetric.ppt.pdf",
  abstract      = "Linux clusters have become very popular for scientific
                   computing at research institutions world-wide, because they
                   can be easily deployed at a fairly low cost. However, the
                   most pressing issues of today's cluster solutions are
                   availability and serviceability. The conventional Beowulf
                   cluster architecture has a single head node connected to a
                   group of compute nodes. This head node is a typical single
                   point of failure and control, which severely limits
                   availability and serviceability by effectively cutting off
                   healthy compute nodes from the outside world upon overload
                   or failure. In this paper, we describe a paradigm that
                   addresses this issue using asymmetric active-active high
                   availability. Our framework comprises n + 1 head nodes,
                   where n head nodes are active in the sense that they provide
                   services to simultaneously incoming user requests. One
                   standby server monitors all active servers and performs a
                   fail-over in case of a detected outage. We present a
                   prototype implementation based on a 2 + 1 solution and
                   discuss initial results."
}
@conference{engelmann05lightweight,
  author        = "Christian Engelmann
                   and George A. (Al) Geist",
  title         = "A Lightweight Kernel for the Harness Metacomputing
                   Framework",
  booktitle     = "Proceedings of the
                   \href{http://www.ipdps.org/ipdps2005}{$19^{th}$ IEEE
                   International Parallel and Distributed Processing Symposium
                   (IPDPS) 2005}: \href{http://www.cs.umass.edu/~rsnbrg/hcw2005}
                   {$14^{th}$ Heterogeneous Computing Workshop (HCW) 2005}",
  month         = apr # "~4, ",
  year          = "2005",
  address       = "Denver, CO, USA",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "0-7695-2312-9",
  issn          = "1530-2075",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/IPDPS.2005.34",
  url           = "http://www.christian-engelmann.info/publications/engelmann05lightweight.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann05lightweight.ppt.pdf",
  abstract      = "Harness is a pluggable heterogeneous Distributed Virtual
                   Machine (DVM) environment for parallel and distributed
                   scientific computing. This paper describes recent
                   improvements in the Harness kernel design. By using a
                   lightweight approach and moving previously integrated system
                   services into software modules, the software becomes more
                   versatile and adaptable. This paper outlines these changes
                   and explains the major Harness kernel components in more
                   detail. A short overview is given of ongoing efforts in
                   integrating RMIX, a dynamic heterogeneous reconfigurable
                   communication framework, into the Harness environment as a
                   new plug-in software module. We describe the overall impact
                   of these changes and how they relate to other ongoing work."
}
@conference{engelmann04high,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and George A. (Al) Geist",
  title         = "High Availability through Distributed Control",
  booktitle     = "Proceedings of the
                   \href{http://xcr.cenit.latech.edu/hapcw2004}{$2^{nd}$ High
                   Availability and Performance Workshop (HAPCW) 2004}, in
                   conjunction with the
                   \href{http://lacsi.rice.edu/symposium/agenda_2004}{$5^{th}$
                   Los Alamos Computer Science Institute (LACSI) Symposium
                   2004}",
  month         = oct # "~12, ",
  year          = "2004",
  address       = "Santa Fe, NM, USA",
  url           = "http://www.christian-engelmann.info/publications/engelmann04high.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann04high.ppt.pdf",
  abstract      = "Cost-effective, flexible and efficient scientific simulations
                   in cutting-edge research areas utilize huge high-end
                   computing resources with thousands of processors. In the next
                   five to ten years the number of processors in such computer
                   systems will rise to tens of thousands, while scientific
                   application running times are expected to increase further
                   beyond the Mean-Time-To-Interrupt (MTTI) of hardware and
                   system software components. This paper describes the ongoing
                   research in heterogeneous adaptable reconfigurable networked
                   systems (Harness) and its recent achievements in the area of
                   high availability distributed virtual machine environments
                   for parallel and distributed scientific computing. It shows
                   how a distributed control algorithm is able to steer a
                   distributed virtual machine process in virtual synchrony
                   while maintaining consistent replication for high
                   availability. It briefly illustrates ongoing work in
                   heterogeneous reconfigurable communication frameworks and
                   security mechanisms. The paper continues with a short
                   overview of similar research in reliable group communication
                   frameworks, fault-tolerant process groups and highly
                   available distributed virtual processes. It closes with a
                   brief discussion of possible future research directions."
}
@conference{he04highly,
  author        = "Xubin (Ben) He
                   and Li Ou
                   and Stephen L. Scott
                   and Christian Engelmann",
  title         = "A Highly Available Cluster Storage System using Scavenging",
  booktitle     = "Proceedings of the
                   \href{http://xcr.cenit.latech.edu/hapcw2004}{$2^{nd}$ High
                   Availability and Performance Workshop (HAPCW) 2004}, in
                   conjunction with the
                   \href{http://lacsi.rice.edu/symposium/agenda_2004}{$5^{th}$
                   Los Alamos Computer Science Institute (LACSI) Symposium
                   2004}",
  month         = oct # "~12, ",
  year          = "2004",
  address       = "Santa Fe, NM, USA",
  url           = "http://www.christian-engelmann.info/publications/he04highly.pdf",
  url2          = "http://www.christian-engelmann.info/publications/he04highly.ppt.pdf",
  abstract      = "Highly available data storage for high-performance computing
                   is becoming increasingly more critical as high-end computing
                   systems scale up in size and storage systems are developed
                   around network-centered architectures. A promising solution
                   is to harness the collective storage potential of individual
                   workstations, much as we harness idle CPU cycles, given the
                   excellent price/performance ratio and low storage usage of
                   most commodity workstations. For such a storage system,
                   metadata consistency is a key issue in assuring storage system
                   availability as well as data reliability. In this paper, we
                   present a decentralized metadata management scheme that
                   improves storage availability without sacrificing
                   performance."
}
@conference{engelmann03diskless,
  author        = "Christian Engelmann
                   and George A. (Al) Geist",
  title         = "A Diskless Checkpointing Algorithm for Super-scale
                   Architectures Applied to the Fast Fourier Transform",
  booktitle     = "Proceedings of the
                   \href{http://www.cs.msstate.edu/~clade2003}{Challenges of
                   Large Applications in Distributed Environments Workshop
                   (CLADE) 2003}, in conjunction with the
                   \href{http://csag.ucsd.edu/HPDC-12}{$12^{th}$ IEEE
                   International Symposium on High Performance Distributed
                   Computing (HPDC) 2003}",
  pages         = "47",
  month         = jun # "~21, ",
  year          = "2003",
  address       = "Seattle, WA, USA",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "0-7695-1984-9",
  doi           = "http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4159902",
  url           = "http://www.christian-engelmann.info/publications/engelmann03diskless.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann03diskless.ppt.pdf",
  abstract      = "This paper discusses the issue of fault-tolerance in
                   distributed computer systems with tens or hundreds of
                   thousands of diskless processor units. Such systems, like the
                   IBM Blue Gene/L, are predicted to be deployed in the next
                   five to ten years. Since a 100,000-processor system is going
                   to be less reliable, scientific applications need to be able
                   to recover from failures more efficiently. In this
                   paper, we adapt the present technique of diskless
                   checkpointing to such huge distributed systems in order to
                   equip existing scientific algorithms with super-scalable
                   fault-tolerance. First, we discuss the method of diskless
                   checkpointing, then we adapt this technique to super-scale
                   architectures and finally we present results from an
                   implementation of the Fast Fourier Transform that uses the
                   adapted technique to achieve super-scale fault-tolerance."
}
@conference{engelmann02distributed,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and George A. (Al) Geist",
  title         = "Distributed Peer-to-Peer Control in {Harness}",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://www.science.uva.nl/events/ICCS2002}{$2^{nd}$
                   International Conference on Computational Science (ICCS)
                   2002}, Part II: Workshop on Global and Collaborative
                   Computing",
  volume        = "2330",
  pages         = "720--727",
  month         = apr # "~21-24, ",
  year          = "2002",
  address       = "Amsterdam, The Netherlands",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "3-540-43593-X",
  issn          = "0302-9743",
  doi           = "http://www.springerlink.com/content/l537ujfwt8yta2dp",
  url           = "http://www.christian-engelmann.info/publications/engelmann02distributed.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann02distributed.ppt.pdf",
  abstract      = "Harness is an adaptable fault-tolerant virtual machine
                   environment for next-generation heterogeneous distributed
                   computing, developed as a follow-on to PVM. It additionally
                   enables the assembly of applications from plug-ins and
                   provides fault-tolerance. This work describes the distributed
                   control, which manages global state replication to ensure a
                   high availability of service. Group communication services
                   achieve agreement on an initial global state and a linear
                   history of global state changes at all members of the
                   distributed virtual machine. This global state is replicated
                   to all members to easily recover from single, multiple and
                   cascaded faults. A peer-to-peer ring network architecture and
                   tunable multi-point failure conditions provide heterogeneity
                   and scalability. Finally, the integration of the distributed
                   control into the multi-threaded kernel architecture of
                   Harness offers a fault-tolerant global state database service
                   for plug-ins and applications."
}
@misc{fiala11detection,
  author        = "David Fiala
                   and Frank Mueller
                   and Christian Engelmann
                   and Rolf Riesen
                   and Kurt Ferreira",
  title         = "Detection and Correction of Silent Data Corruption for
                   Large-Scale High-Performance Computing",
  month         = nov # "~12-18, ",
  year          = "2011",
  howpublished  = "{Poster at the \href{http://sc11.supercomputing.org}
                   {24th IEEE/ACM International Conference on High Performance
                    Computing, Networking, Storage and Analysis (SC) 2011},
                   Seattle, WA, USA}",
  url           = "",
  abstract      = "Faults have become the norm rather than the exception for
                   high-end computing on clusters with 10s/100s of thousands of
                   cores. Exacerbating this situation, some of these faults will
                   not be detected, manifesting themselves as silent errors that
                   will corrupt memory while applications continue to operate and
                   report incorrect results. This poster introduces RedMPI, an
                   MPI library which resides in the MPI profiling layer. RedMPI
                   is capable of both online detection and correction of soft
                   errors that occur in MPI applications without requiring any
                   modifications to the application source. By providing
                   redundancy, RedMPI is capable of transparently detecting
                   corrupt messages from MPI processes that become faulted during
                   execution. Furthermore, with triple redundancy RedMPI
                   additionally ``votes'' out MPI messages of a faulted process
                   by replacing corrupted results with corrected results from
                   unfaulted processes. We present an experimental evaluation of
                   RedMPI on an assortment of applications to demonstrate the
                   effectiveness of this approach."
}
@misc{fiala11tunable2,
  author        = "David Fiala
                   and Kurt Ferreira
                   and Frank Mueller
                   and Christian Engelmann",
  title         = "A Tunable, Software-based DRAM Error Detection and Correction
                   Library for HPC",
  month         = nov # "~12-18, ",
  year          = "2011",
  howpublished  = "{Poster at the \href{http://sc11.supercomputing.org}
                   {24th IEEE/ACM International Conference on High Performance
                    Computing, Networking, Storage and Analysis (SC) 2011},
                   Seattle, WA, USA}",
  url           = "",
  abstract      = "Proposed exascale systems will present a number of
                   considerable resiliency challenges. In particular, DRAM
                   soft-errors, or bit-flips, are expected to greatly increase
                   due to the increased memory density of these systems. Current
                   hardware-based fault-tolerance methods will be unsuitable for
                   addressing the expected soft error rate. As a
                   result, additional software will be needed to address this
                   challenge. In this paper we introduce LIBSDC, a tunable,
                   transparent silent data corruption detection and correction
                   library for HPC applications. LIBSDC provides comprehensive
                   SDC protection for program memory by implementing on-demand
                   page integrity verification using the MMU. Experimental
                   benchmarks with Mantevo HPCCG show that, once tuned, LIBSDC
                   achieves SDC protection with less than 100\% resource
                   overhead."
}
@misc{scott09tunable2,
  author        = "Stephen L. Scott
                   and Christian Engelmann
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Anand Tikotekar
                   and George Ostrouchov
                   and Chokchai (Box) Leangsuksun
                   and Nichamon Naksinehaboon
                   and Raja Nassar
                   and Mihaela Paun
                   and Frank Mueller
                   and Chao Wang
                   and Arun B. Nagarajan
                   and Jyothish Varma",
  title         = "A Tunable Holistic Resiliency Approach for High-Performance
                   Computing Systems",
  month         = aug # "~12-14, ",
  year          = "2009",
  howpublished  = "{Poster at the
                   \href{http://institute.lanl.gov/resilience/conferences/2009}
                   {National HPC Workshop on Resilience 2009}, Arlington, VA,
                   USA}",
  url           = "http://www.christian-engelmann.info/publications/scott09tunable2.pdf",
  abstract      = "In order to address anticipated high failure rates,
                   resiliency characteristics have become an urgent priority for
                   next-generation extreme-scale high-performance computing
                   (HPC) systems. This poster describes our past and ongoing
                   efforts in novel fault resilience technologies for HPC.
                   Presented work includes proactive fault resilience
                   techniques, system and application reliability models and
                   analyses, failure prediction, transparent process- and
                   virtual-machine-level migration, and trade-off models for
                   combining preemptive migration with checkpoint/restart. This
                   poster summarizes our work and puts all individual
                   technologies into context with a proposed holistic fault
                   resilience framework."
}
@misc{scott09systemlevel,
  author        = "Stephen L. Scott
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Anand Tikotekar
                   and Christian Engelmann
                   and Hong H. Ong",
  title         = "System-level Virtualization for for High-Performance
                   Computing",
  month         = aug # "~12-14, ",
  year          = "2009",
  howpublished  = "{Poster at the
                   \href{http://institute.lanl.gov/resilience/conferences/2009}
                   {National HPC Workshop on Resilience 2009}, Arlington, VA,
                   USA}",
  url           = "http://www.christian-engelmann.info/publications/scott09systemlevel.pdf",
  abstract      = "This poster summarizes our past and ongoing research and
                   development efforts in novel system software solutions for
                   providing a virtual system environment (VSE) for
                   next-generation extreme-scale high-performance computing
                   (HPC) systems and beyond. The poster showcases results of
                   developed proof-of-concept implementations and performed
                   theoretical analyses, outlines planned research and
                   development activities, and presents respective initial
                   results."
}
@misc{scott09tunable,
  author        = "Stephen L. Scott
                   and Christian Engelmann
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Anand Tikotekar
                   and George Ostrouchov
                   and Chokchai (Box) Leangsuksun
                   and Nichamon Naksinehaboon
                   and Raja Nassar
                   and Mihaela Paun
                   and Frank Mueller
                   and Chao Wang
                   and Arun B. Nagarajan
                   and Jyothish Varma",
  title         = "A Tunable Holistic Resiliency Approach for High-Performance
                   Computing Systems",
  month         = feb # "~14-18, ",
  year          = "2009",
  howpublished  = "{Poster at the \href{http://ppopp09.rice.edu}{$14^{th}$ ACM
                   SIGPLAN Symposium on Principles and Practice of Parallel
                   Programming (PPoPP) 2009}, Raleigh, NC, USA}",
  url           = "http://www.christian-engelmann.info/publications/scott09tunable.pdf",
  abstract      = "In order to address anticipated high failure rates,
                   resiliency characteristics have become an urgent priority for
                   next-generation extreme-scale high-performance computing
                   (HPC) systems. This poster describes our past and ongoing
                   efforts in novel fault resilience technologies for HPC.
                   Presented work includes proactive fault resilience
                   techniques, system and application reliability models and
                   analyses, failure prediction, transparent process- and
                   virtual-machine-level migration, and trade-off models for
                   combining preemptive migration with checkpoint/restart. This
                   poster summarizes our work and puts all individual
                   technologies into context with a proposed holistic fault
                   resilience framework."
}
@misc{geist08harness,
  author        = "George A. (Al) Geist
                   and Christian Engelmann
                   and Jack J. Dongarra
                   and George Bosilca
                   and Magdalena M. S\l{}awi\'nska
                   and Jaros\l{}aw K. S\l{}awi\'nski",
  title         = "The {Harness} Workbench: {U}nified and Adaptive Access to
                   Diverse High-Performance Computing Platforms",
  month         = mar # "~30 - " # apr # "~5, ",
  year          = "2008",
  howpublished  = "{Poster at the \href{http://www.hpcsw.org}{$1^{st}$
                   High-Performance Computer Science Week (HPCSW) 2008}, Denver,
                   CO, USA}",
  url           = "http://www.christian-engelmann.info/publications/geist08harness.pdf",
  abstract      = "This poster summarizes our past and ongoing research and
                   development efforts in novel software solutions for providing
                   unified and adaptive access to diverse high-performance
                   computing (HPC) platforms. The poster showcases developed
                   proof-of-concept implementations of tools and mechanisms that
                   simplify scientific application development and deployment
                   tasks, such that only minimal adaptation is needed when
                   moving from one HPC system to another or after HPC system
                   upgrades."
}
@misc{scott08resiliency,
  author        = "Stephen L. Scott
                   and Christian Engelmann
                   and Hong H. Ong
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Anand Tikotekar
                   and George Ostrouchov
                   and Chokchai (Box) Leangsuksun
                   and Nichamon Naksinehaboon
                   and Raja Nassar
                   and Mihaela Paun
                   and Frank Mueller
                   and Chao Wang
                   and Arun B. Nagarajan
                   and Jyothish Varma
                   and Xubin (Ben) He
                   and Li Ou
                   and Xin Chen",
  title         = "Resiliency for High-Performance Computing Systems",
  month         = mar # "~30 - " # apr # "~5, ",
  year          = "2008",
  howpublished  = "{Poster at the \href{http://www.hpcsw.org}{$1^{st}$
                   High-Performance Computer Science Week (HPCSW) 2008}, Denver,
                   CO, USA}",
  url           = "http://www.christian-engelmann.info/publications/scott08resiliency.pdf",
  abstract      = "This poster summarizes our past and ongoing research and
                   development efforts in novel system software solutions for
                   providing high-level reliability, availability and
                   serviceability (RAS) for next-generation extreme-scale
                   high-performance computing (HPC) systems and beyond. The
                   poster showcases results of developed proof-of-concept
                   implementations and performed theoretical analyses, outlines
                   planned research and development activities, and presents
                   respective initial results."
}
@misc{scott08systemlevel,
  author        = "Stephen L. Scott
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Anand Tikotekar
                   and Christian Engelmann
                   and Hong H. Ong",
  title         = "System-level Virtualization for for High-Performance
                   Computing",
  month         = mar # "~30 - " # apr # "~5, ",
  year          = "2008",
  howpublished  = "{Poster at the \href{http://www.hpcsw.org}{$1^{st}$
                   High-Performance Computer Science Week (HPCSW) 2008}, Denver,
                   CO, USA}",
  url           = "http://www.christian-engelmann.info/publications/scott08systemlevel.pdf",
  abstract      = "This poster summarizes our past and ongoing research and
                   development efforts in novel system software solutions for
                   providing a virtual system environment (VSE) for
                   next-generation extreme-scale high-performance computing
                   (HPC) systems and beyond. The poster showcases results of
                   developed proof-of-concept implementations and performed
                   theoretical analyses, outlines planned research and
                   development activities, and presents respective initial
                   results."
}
@misc{engelmann13hardware,
  author        = "Christian Engelmann
                   and Thomas Naughton",
  title         = "A Hardware/Software Performance/Resilience/Power Co-Design
                   Tool for Extreme-scale Computing",
  howpublished  = "Whitepaper submitted to the U.S. Department of Energy's
                   \href{http://hpc.pnl.gov/modsim/2013}{Workshop on Modeling
                   \& Simulation of Exascale Systems \& Applications (ModSim)
                   2013}",
  month         = sep # "~18-19, ",
  year          = "2013",
  address       = "Seattle, WA, USA",
  url           = "http://www.christian-engelmann.info/publications/engelmann13hardware.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann13hardware.ppt.pdf",
  abstract      = "xSim is a simulation-based performance investigation toolkit
                   that permits running high-performance computing (HPC)
                   applications in a controlled environment with millions of
                   concurrent execution threads, while observing application
                   performance in a simulated extreme-scale system for
                   hardware/software co-design. The presented work details newly
                   developed features for xSim that permit the injection of MPI
                   process failures, the propagation/detection/notification of
                   such failures within the simulation, and their handling using
                   application-level checkpoint/restart. The newly added features
                   also offer user-level failure mitigation (ULFM) extensions
                   at the simulated MPI layer to support algorithm-based fault
                   tolerance (ABFT). The presented solution permits investigating
                   performance under failure and failure handling of
                   checkpoint/restart and ABFT solutions. The newly enhanced xSim
                   is the very first performance tool that supports these
                   capabilities."
}
@misc{snir13addressing,
  author        = "Marc Snir and
                   and Robert W. Wisniewski
                   and Jacob A. Abraham
                   and Sarita V. Adve
                   and Saurabh Bagchi
                   and Pavan Balaji
                   and Bill Carlson
                   and Andrew A. Chien
                   and Pedro Diniz
                   and Christian Engelmann
                   and Rinku Gupta
                   and Fred Johnson
                   and Jim Belak
                   and Pradip Bose
                   and Franck Cappello
                   and Paul Coteus
                   and Nathan A. Debardeleben
                   and Mattan Erez
                   and Saverio Fazzari
                   and Al Geist
                   and Sriram Krishnamoorthy
                   and Sven Leyffer
                   and Dean Liberty
                   and Subhasish Mitra
                   and Todd Munson
                   and Rob Schreiber
                   and Jon Stearley
                   and Eric Van Hensbergen",
  title         = "Addressing Failures in Exascale Computing",
  howpublished  = "Workshop report",
  month         = aug # "~4-11, ",
  year          = "2013",
  address       = "Park City, UT, USA",
  url           = "http://www.christian-engelmann.info/publications/snir13addressing.pdf"
}
@misc{geist12department,
  author        = "Al Geist
                   and Bob Lucas
                   and Marc Snir
                   and Shekhar Borkar
                   and Eric Roman
                   and Mootaz Elnozahy
                   and Bert Still
                   and Andrew Chien
                   and Robert Clay
                   and John Wu
                   and Christian Engelmann
                   and Nathan DeBardeleben
                   and Rob Ross
                   and Larry Kaplan
                   and Martin Schulz
                   and Mike Heroux
                   and Sriram Krishnamoorthy
                   and Lucy Nowell
                   and Abhinav Vishnu
                   and Lee-Ann Talley",
  title         = "U.S. Department of Energy Fault Management Workshop",
  howpublished  = "Workshop report submitted to the U.S. Department of Energy",
  month         = jun # "~6, ",
  year          = "2012",
  address       = "Baltimore, MA, USA",
  url           = "http://www.christian-engelmann.info/publications/geist12department.pdf",
  abstract      = "A Department of Energy (DOE) Fault Management Workshop was
                   held on June 6, 2012 at the BWI Airport Marriott hotel in
                   Maryland. The goals of this workshop were to: 1. Describe
                   the required HPC resilience for critical DOE mission needs;
                   2. Detail what HPC resilience research is already being done
                   at the DOE national laboratories and is expected to be done
                   by industry or other groups; 3. Determine what fault
                   management research is a priority for DOE's Office of
                   Science and National Nuclear Security Administration
                   (NNSA) over the next five years; 4. Develop a roadmap for
                   getting the necessary research accomplished in the timeframe
                   when it will be needed by the large computing facilities
                   across DOE."
}
@misc{engelmann12performance,
  author        = "Christian Engelmann
                   and Thomas Naughton",
  title         = "A Performance/Resilience/Power Co-design Tool for
                   Extreme-scale High-Performance Computing",
  howpublished  = "Whitepaper submitted to the U.S. Department of Energy's
                   \href{http://hpc.pnl.gov/modsim/2012}{Workshop on Modeling
                   \& Simulation of Exascale Systems \& Applications (ModSim)
                   2012}",
  month         = aug # "~9-10, ",
  year          = "2012",
  address       = "Seattle, WA, USA",
  url           = "http://www.christian-engelmann.info/publications/engelmann12performance.pdf",
  abstract      = "Performance, resilience and power consumption are key HPC
                   system design factors that are highly interdependent. To
                   enable extreme-scale computing it is essential to perform
                   HPC hardware/software co-design that identifies the
                   cost/benefit trade-off between these design factors for
                   potential future architecture choices. The proposed research
                   and development aims at developing an HPC hardware/software
                   co-design toolkit for evaluating the
                   resilience/power/performance cost/benefit trade-off of
                   future architecture choices. The approach focuses on
                   extending a simulation-based performance investigation
                   toolkit with advanced resilience and power modeling and
                   simulation features, such as (i) fault injection mechanisms,
                   (ii) fault propagation, isolation, and detection models, (iii)
                   fault avoidance, masking, and recovery simulation, and (iv)
                   power consumption models."
}
@misc{engelmann12dynamic,
  author        = "Christian Engelmann
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Frank Mueller",
  title         = "Dynamic Self-Aware Runtime Software for Exascale Systems",
  howpublished  = "Whitepaper submitted to the U.S. Department of Energy's
                   Exascale Operating Systems and Runtime Technical Council",
  month         = jul,
  year          = "2012",
  url           = "http://www.christian-engelmann.info/publications/engelmann12dynamic.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann12dynamic.ppt.pdf",
  abstract      = "At exascale, the power consumption, resilience, and load
                   balancing constraints, especially their dynamic nature and
                   interdependence, and the scale of the system require a
                   radical change in future high-performance computing (HPC)
                   operating systems and runtimes (OS/Rs). In contrast to the
                   existing static OS/R solutions, an exascale OS/R is needed
                   that is aware of the dynamically changing resources,
                   constraints, and application needs, and that is able to
                   autonomously coordinate (sometimes conflicting) responses
                   to different changes in the system, simultaneously and at
                   scale. To provide awareness and autonomic management, a
                   novel, scalable and self-aware OS/R is needed that becomes
                   the brains of the entire X-stack. It dynamically analyzes
                   past, current, and future system status and application
                   needs. It optimizes system usage by scheduling, migrating,
                   and restarting tasks within and across nodes as needed to
                   deal with multi-dimensional constraints, such as power
                   consumption, permanent and transient faults, resource
                   degradation, heterogeneity, data locality, and load balance."
}
@misc{debardeleben09high-end,
  author        = "Nathan DeBardeleben
                   and James Laros
                   and John T. Daly
                   and Stephen L. Scott
                   and Christian Engelmann
                   and Bill Harrod",
  title         = "High-End Computing Resilience: {Analysis} of Issues
                   Facing the {HEC} Community and Path-Forward for
                   Research and Development",
  howpublished  = "Whitepaper submitted to the U.S. National Science Foundation's High-end Computing Program",
  month         = dec,
  year          = "2009",
  url           = "http://www.christian-engelmann.info/publications/debardeleben09high-end.pdf"
}
@techreport{fiala12detection,
  author        = "David Fiala
                   and Frank Mueller
                   and Christian Engelmann
                   and Kurt Ferreira
                   and Ron Brightwell
                   and Rolf Riesen",
  title         = "Detection and Correction of Silent Data Corruption for Large-Scale High-Performance Computing",
  institution   = "Oak Ridge National Laboratory",
  number        = "ORNL/TM-2012/227",
  address       = "Oak Ridge, TN, USA",
  month         = jun,
  year          = "2012",
  url           = "http://www.christian-engelmann.info/publications/fiala12detection.pdf",
  abstract      = "Faults have become the norm rather than the exception for
                   high-end computing on clusters with 10s/100s of thousands
                   of cores. Exacerbating this situation, some of these faults
                   remain undetected, manifesting themselves as silent errors
                   that corrupt memory while applications continue to operate
                   and report incorrect results.
                   This paper studies the potential for redundancy to both
                   detect and correct soft errors in MPI message-passing
                   applications. Our study investigates the challenges inherent
                   to detecting soft errors within MPI application while
                   providing transparent MPI redundancy. By assuming a model
                   wherein corruption in application data manifests itself by
                   producing differing MPI message data between replicas, we
                   study the best suited protocols for detecting and correcting
                   MPI data that is the result of corruption.
                   To experimentally validate our proposed detection and
                   correction protocols, we introduce RedMPI, an MPI library
                   which resides in the MPI profiling layer. RedMPI is capable
                   of both online detection and correction of soft errors that
                   occur in MPI applications without requiring any modifications
                   to the application source by utilizing either double or
                   triple redundancy.
                   Our results indicate that our most efficient consistency
                   protocol can successfully protect applications experiencing
                   even high rates of silent data corruption with runtime
                   overheads between 0\% and 30\% as compared to unprotected
                   applications without redundancy.
                   Using our fault injector within RedMPI, we observe that even
                   a single soft error can have profound effects on running
                   applications, causing a cascading pattern of corruption that
                   in most cases spreads to all other processes.
                   RedMPI's protection has been shown to successfully mitigate
                   the effects of soft errors while allowing applications to
                   complete with correct results even in the face of errors."
}
@techreport{wang10hybrid,
  author        = "Chao Wang
                   and Frank Mueller
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Hybrid Full/Incremental Checkpoint/Restart for {MPI} Jobs in
                   {HPC} Environments",
  institution   = "Oak Ridge National Laboratory",
  number        = "ORNL/TM-2010/162",
  address       = "Oak Ridge, TN, USA",
  month         = aug,
  year          = "2010",
  url           = "http://www.christian-engelmann.info/publications/wang10hybrid.pdf",
  abstract      = "As the number of cores in high-performance computing
                   environments keeps increasing, faults are becoming
                   commonplace. Checkpointing addresses such faults but captures
                   full process images even though only a subset of the
                   process image changes between checkpoints.
                   We have designed a high-performance hybrid disk-based
                   full/incremental checkpointing technique for MPI tasks
                   to capture only data changed since the last checkpoint.
                   Our implementation integrates new BLCR and LAM/MPI
                   features that complement traditional full checkpoints.
                   This results in significantly reduced checkpoint sizes
                   and overheads with only moderate increases in restart
                   overhead. After accounting for cost and savings, benefits
                   due to incremental checkpoints significantly outweigh the
                   loss on restart operations.
                   Experiments in a cluster with the NAS Parallel Benchmark
                   suite and mpiBLAST indicate that savings due to replacing
                   full checkpoints with incremental ones average 16.64
                   seconds while restore overhead amounts to just 1.17
                   seconds. These savings increase with the frequency of
                   incremental checkpoints. Overall, our novel hybrid
                   full/incremental checkpointing is superior to prior
                   non-hybrid techniques."
}
@techreport{wang10proactive,
  author        = "Chao Wang
                   and Frank Mueller
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Proactive Process-Level Live Migration and Back Migration in
                   {HPC} Environments",
  institution   = "Oak Ridge National Laboratory",
  number        = "ORNL/TM-2010/161",
  address       = "Oak Ridge, TN, USA",
  month         = aug,
  year          = "2010",
  url           = "http://www.christian-engelmann.info/publications/wang10proactive.pdf",
  abstract      = "As the number of nodes in high-performance computing
                   environments keeps increasing, faults are becoming
                   commonplace. Reactive fault tolerance (FT) often does not scale
                   due to massive I/O requirements and relies on manual job
                   resubmission. This work complements reactive with proactive
                   FT at the process level. Through health monitoring, a subset
                   of node failures can be anticipated when a node's health
                   deteriorates. A novel process-level live migration mechanism
                   supports continued execution of applications during much of
                   the process migration. This scheme is integrated into an MPI
                   execution environment to transparently sustain
                   health-inflicted node failures, which eradicates the need to
                   restart and requeue MPI jobs. Experiments indicate that 1-6.5
                   seconds of prior warning are required to successfully trigger
                   live process migration while similar operating system
                   virtualization mechanisms require 13-24 seconds. This
                   self-healing approach complements reactive FT by nearly
                   cutting the number of checkpoints in half when 70\% of
                   the faults are handled proactively. The work also provides
                   a novel back migration approach to eliminate load imbalance
                   or bottlenecks caused by migrated tasks. Experiments indicate
                   the larger the amount of outstanding execution, the greater
                   the benefit of back migration."
}
@misc{engelmann14supporting,
  author        = "Christian Engelmann",
  title         = "Supporting the Development of Resilient Message Passing
                   Applications using Simulation",
  month         = sep # "~28 - " # oct # "~1, ",
  year          = "2014",
  howpublished  = "Invited talk at the
                   \href{http://www.dagstuhl.de/en/program/calendar/semhp/?semnr=14402}
                   {Dagstuhl Seminar on Resilience in Exascale Computing},
                   Schloss Dagstuhl, Wadern, Germany",
  url           = "http://www.christian-engelmann.info/publications/engelmann14supporting.ppt.pdf",
  abstract      = "An emerging aspect of high-performance computing (HPC)
                   hardware/software co-design is investigating performance
                   under failure. The presented work extends the Extreme-scale
                   Simulator (xSim), which was designed for evaluating the
                   performance of message passing interface (MPI) applications
                   on future HPC architectures, with fault-tolerant MPI
                   extensions proposed by the MPI Fault Tolerance Working Group.
                   xSim permits running MPI applications with millions of
                   concurrent MPI ranks, while observing application performance
                   in a simulated extreme-scale system using a lightweight
                   parallel discrete event simulation. The newly added features
                   offer user-level failure mitigation (ULFM) extensions at the
                   simulated MPI layer to support algorithm-based fault tolerance
                   (ABFT). The presented solution permits investigating
                   performance under failure and failure handling of ABFT
                   solutions. The newly enhanced xSim is the very first
                   performance tool that supports ULFM and ABFT."
}
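Editor's note: for readers unfamiliar with the ULFM extensions this talk
builds on, the C sketch below shows the usual recovery idiom. The names
follow the MPI Fault Tolerance Working Group's ULFM proposal (MPIX_
prefix, <mpi-ext.h>); how xSim exposes them internally is not stated in
the entry, so treat this as a generic illustration.

#include <mpi.h>
#include <mpi-ext.h>  /* ULFM prototypes: MPIX_Comm_revoke/shrink, error classes */

/* Illustrative ULFM recovery idiom for an ABFT code: detect a failed rank,
 * revoke the communicator so every rank observes the failure, shrink it,
 * and let the ABFT layer rebuild lost data from its redundancy. Assumes
 * MPI_Comm_set_errhandler(*comm, MPI_ERRORS_RETURN) was called at setup. */
static int resilient_allreduce(double *in, double *out, MPI_Comm *comm)
{
    int rc = MPI_Allreduce(in, out, 1, MPI_DOUBLE, MPI_SUM, *comm);
    if (rc == MPIX_ERR_PROC_FAILED || rc == MPIX_ERR_REVOKED) {
        MPI_Comm shrunk;
        MPIX_Comm_revoke(*comm);           /* make the failure globally known */
        MPIX_Comm_shrink(*comm, &shrunk);  /* new communicator without failed ranks */
        MPI_Comm_free(comm);
        *comm = shrunk;                    /* caller's ABFT recovery runs next */
    }
    return rc;
}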
@misc{engelmann13resilience,
  author        = "Christian Engelmann",
  title         = "Resilience Challenges and Solutions for Extreme-Scale
                   Supercomputing",
  month         = sep # "~3, ",
  year          = "2013",
  howpublished  = "{Invited talk at the Technical University of Dresden,
                    Dresden, Germany}",
  url           = "http://www.christian-engelmann.info/publications/engelmann13resilience.ppt.pdf",
  abstract      = "With the recent deployment of the 18 PFlop/s Titan
                   supercomputer and the exascale roadmap targeting 100, 300,
                   and eventually 1,000 PFlop/s by 2022, Oak Ridge National
                   Laboratory is at the forefront of scientific capability
                   computing. The path to exascale computing poses several
                   research challenges related to power, performance,
                   resilience, productivity, programmability, data movement,
                   and data management. Resilience, i.e., providing efficiency
                   and correctness in the presence of faults, is one of the
                   most important exascale computer science challenges as
                   systems scale up in component count (100,000-1,000,000
                   nodes with 1,000-10,000 cores per node by 2022) and
                   component reliability decreases (7 nm technology with
                   near-threshold voltage operation by 2022). This talk
                   provides an overview of recent and ongoing resilience
                   research and development activities at Oak Ridge National
                   Laboratory in advanced checkpoint storage architectures,
                   process-level incremental checkpoint/restart, proactive
                   fault tolerance using prediction-triggered process or
                   virtual machine migration, MPI process-level software
                   redundancy, and soft-error injection tools to study the
                   vulnerability of science applications and of CMOS logic
                   in processors and memory."
}
@misc{engelmann12fault,
  author        = "Christian Engelmann",
  title         = "Fault Tolerance Session",
  month         = oct # "~16-17, ",
  year          = "2012",
  howpublished  = "{Invited talk at the
                    \href{http://www.aanmelder.nl/exachallenge}
                    {The ExaChallenge Symposium}, Dublin, Ireland}",
  url           = "http://www.christian-engelmann.info/publications/engelmann12fault.ppt.pdf"
}
@misc{engelmann12high-end,
  author        = "Christian Engelmann",
  title         = "High-End Computing Resilience: Analysis of Issues Facing the
                   HEC Community and Path Forward for Research and Development",
  month         = aug # "~4-11, ",
  year          = "2012",
  howpublished  = "{Invited talk at the Argonne National Laboratory (ANL)
                    Institute for Computing in Science (ICiS)
                    \href{http://www.icis.anl.gov/programs/summer2012-4b}
                    {Summer Workshop Week on Addressing Failures in Exascale
                     Computing}, Park City, UT, USA}",
  url           = "http://www.christian-engelmann.info/publications/engelmann12high-end.ppt.pdf",
  abstract      = "The path to exascale computing poses several research
                   challenges related to power, performance, resilience,
                   productivity, programmability, data movement, and data
                   management. Resilience, i.e., providing efficiency and
                   correctness in the presence of faults, is one of the most
                   important exascale computer science challenges as systems
                   scale up in component count (100,000-1,000,000 nodes with
                   1,000-10,000 cores per node by 2020) and component
                   reliability decreases (7 nm technology with near-threshold
                   voltage operation by 2020). To provide input for a
                   discussion of future needs in resilience research,
                   development, and standards work, this talk gives a brief
                   summary of the outcomes from the National HPC Workshop on
                   Resilience, held in Arlington, VA, USA on August 12-14,
                   2009."
}
@misc{engelmann12resilience,
  author        = "Christian Engelmann",
  title         = "Resilience for Permanent, Transient, and Undetected Errors",
  month         = mar # "~12-15, ",
  year          = "2012",
  howpublished  = "{Invited talk at the
                    \href{http://www.cs.sandia.gov/Conferences/SOS16}
                    {$16^{th}$ Workshop on Distributed Supercomputing (SOS)
                     2012}, Santa Barbara, CA, USA}",
  url           = "http://www.christian-engelmann.info/publications/engelmann12resilience.ppt.pdf",
  abstract      = "With the ongoing deployment of 10-20 PFlop/s supercomputers
                   and the exascale roadmap targeting 100, 300, and eventually
                   1,000 PFlop/s by 2020, the path to exascale computing poses
                   several research challenges related to power, performance,
                   resilience, productivity, programmability, data movement,
                   and data management. Resilience, i.e., providing efficiency
                   and correctness in the presence of faults, is one of the
                   most important exascale computer science challenges as
                   systems scale up in component count (100,000-1,000,000
                   nodes with 1,000-10,000 cores per node by 2020) and
                   component reliability decreases (7 nm technology with
                   near-threshold voltage operation by 2020). This talk
                   provides an overview of recent and ongoing resilience
                   research and development activities at Oak Ridge National
                   Laboratory, and of future needs in resilience research,
                   development, and standards work."
}
@misc{engelmann12scaling,
  author        = "Christian Engelmann",
  title         = "Scaling To A Million Cores And Beyond: A Basic Understanding
                   Of The Challenges Ahead On The Road To Exascale",
  month         = jan # "~24, ",
  year          = "2012",
  howpublished  = "{Invited talk at the
                   \href{https://researcher.ibm.com/researcher/view_page.php?id=2580}
                   {$1^{st}$ International Workshop on Extreme Scale Parallel
                   Architectures and Systems (ESPAS) 2012}, in conjunction with
                   the \href{http://www.hipeac.net/conference/paris}{$7^{th}$
                   International Conference on High-Performance and Embedded
                   Architectures and Compilers (HiPEAC) 2012}, Paris, France}",
  url           = "http://www.christian-engelmann.info/publications/engelmann12scaling.ppt.pdf",
  abstract      = "On the road toward multi-petascale and exascale HPC, the
                   trend in architecture goes clearly in only one direction.
                   HPC systems will dramatically scale up in compute node and
                   processor core counts. By 2020, an exascale system may have
                   up to 1,000,000 compute nodes with 1,000 cores per node. The
                   substantial growth in concurrency causes parallel application
                   scalability issues due to sequential application parts,
                   synchronizing communication, and other bottlenecks.
                   Investigating parallel algorithm performance properties at
                   this scale and with these architectural properties for HPC
                   hardware/software co-design is crucial to enable
                   extreme-scale computing. The presented work utilizes the
                   Extreme-scale Simulator (xSim) performance investigation
                   toolkit to identify the scaling characteristics of a simple
                   Monte Carlo algorithm from 1 to 16 million MPI processes on
                   different multi-core architecture choices. The results show
                   the limitations of strong scaling and the negative impact of
                   employing more but less powerful cores for energy savings."
}
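Editor's note: the strong-scaling limitation referred to above is the
standard Amdahl bound (the entry does not state the formula): with
parallel fraction $f$,
$$ S(n) = \frac{1}{(1-f) + f/n} \le \frac{1}{1-f}, $$
so even $f = 0.9999$ caps speedup at $10^4$, far below the 16 million
simulated MPI processes; choosing more but weaker cores additionally
lowers the absolute performance each rank contributes.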
@misc{engelmann11resilient,
  author        = "Christian Engelmann",
  title         = "Resilient Software for ExaScale Computing",
  month         = nov # "~17, ",
  year          = "2011",
  howpublished  = "{Invited talk at the Birds of a Feather Session on Resilient
                   Software for ExaScale Computing at the
                   \href{http://sc11.supercomputing.org}
                   {24th IEEE/ACM International Conference on High Performance
                    Computing, Networking, Storage and Analysis (SC) 2011},
                   Seattle, WA, USA}",
  url           = "http://www.christian-engelmann.info/publications/engelmann11resilient.ppt.pdf",
  abstract      = "ExaScale computing systems will likely consist of millions
                   of cores executing applications with billions of threads,
                   based on 14nm or less CMOS technology, according to the
                   ITRS roadmap. Processing elements built on this technology,
                   coupled with dynamic power management will exhibit high
                   variability in performance, between cores and across
                   different runs. Even worse, preliminary figures indicate
                   that on average about every couple of minutes - at least -
                   something in the system will break. Traditional
                   checkpointing strategies are unlikely to work, given the
                   time it will take to save the huge quantities of data
                   combined with the fact that they will need to be restored
                   frequently. This BoF wants to investigate resilient
                   software: software that is able to survive failing
                   hardware and continue to run, with minimal performance
                   impact. Furthermore, we may also discuss tradeoffs between
                   rerunning the application and the cost of instrumentation
                   to deal with resilience."
}
@misc{engelmann11resilience,
  author        = "Christian Engelmann",
  title         = "Resilience and Hardware/Software Co-design for Extreme-Scale
                   Supercomputing",
  month         = jul # "~27, ",
  year          = "2011",
  howpublished  = "{Seminar at the \href{http://www.bsc.es}{Barcelona
                   Supercomputing Center}, Barcelona, Spain}",
  url           = "http://www.christian-engelmann.info/publications/engelmann11resilience.ppt.pdf",
  abstract      = "Oak Ridge National Laboratory (ORNL) provides the most
                   powerful high-performance computing (HPC) resources in the
                   world for open scientific research. Jaguar, a 224,162-core
                   Cray XT5 with a LINPACK performance of 1.759 PFlop/s, for
                   example, is the world's 3rd fastest supercomputer. 80\% of
                   its resources are allocated through a reviewed process to
                   address the most challenging scientific problems in climate
                   modeling, renewable energy, materials science, fusion and
                   other areas. ORNL's Computer Science and Mathematics Division
                   performs computer science and mathematics research to
                   increase supercomputer efficiency and application scientist
                   productivity while accelerating time to solution for
                   scientific breakthroughs. This talk details recent research
                   advancements at ORNL in two areas: (1) resilience and (2)
                   hardware/software co-design for extreme-scale supercomputing.
                   Both are essential on the road toward exa-scale HPC systems
                   with millions-to-billions of cores. Due to the expected
                   drastic increase in scale, the corresponding decrease in
                   system mean-time to interrupt warrants a rethinking of the
                   traditional checkpoint/restart approach for HPC resilience.
                   New concepts discussed in this talk range from preventative
                   measures, such as task migration based on fault prediction,
                   to more aggressive fault masking, such as various levels of
                   redundancy. Further, the expected drastic increase in task
                   parallelism requires redesigning algorithms to avoid the
                   consequences of Amdahl's law at extreme scale. As million-way
                   task parallel systems don't exist yet, this talk discusses a
                   lightweight system simulation approach for performance
                   estimation of algorithms at scale."
}
@misc{engelmann10scalable,
  author        = "Christian Engelmann",
  title         = "Scalable HPC System Monitoring",
  month         = oct # "~13, ",
  year          = "2010",
  howpublished  = "{Invited talk at the $3^{rd}$ HPC Resiliency Summit: Workshop
                   on Resiliency for Petascale HPC 2010, in conjunction with the
                   \href{http://www.lanl.gov/conferences/lacss/2010}{$3^{rd}$
                   Los Alamos Computer Science Symposium (LACSS) 2010}, Santa
                   Fe, NM, USA}",
  url           = "http://www.christian-engelmann.info/publications/engelmann10scalable.ppt.pdf",
  abstract      = "We present a monitoring system for large-scale parallel and
                   distributed computing environments that allows accuracy to be
                   traded off in a tunable fashion to gain scalability without
                   compromising fidelity. The approach relies on classifying
                   each gathered monitoring metric based on individual needs
                   and on aggregating messages containing classes of individual
                   monitoring metrics using a tree-based overlay network. The
                   MRNet-based prototype is able to significantly reduce the
                   amount of gathered and stored monitoring data, e.g., by a
                   factor of ~56 in comparison to the Ganglia distributed
                   monitoring system. A simple scaling study reveals, however,
                   that further efforts are needed in reducing the amount of
                   data to monitor future-generation extreme-scale systems with
                   up to 1,000,000 nodes. The implemented solution did not have
                   a measurable performance impact, as the 32-node test system
                   did not produce enough monitoring data to interfere with
                   running applications."
}
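Editor's note: the classify-then-aggregate scheme described above can be
illustrated with the generic C stand-in below. The prototype itself is
built on MRNet; the types, thresholds, and function names here are
hypothetical, not the prototype's API.

/* Each node maps a raw metric sample to a small class; tree-overlay
 * parents then forward per-class counts instead of raw samples, which
 * is what makes the gathered data volume tunable. */
enum metric_class { METRIC_OK = 0, METRIC_WARN, METRIC_CRIT, METRIC_NCLASS };

static enum metric_class classify(double value, double warn, double crit)
{
    if (value >= crit) return METRIC_CRIT;  /* e.g., temperature past limit */
    if (value >= warn) return METRIC_WARN;  /* approaching the limit */
    return METRIC_OK;
}

/* A parent in the overlay merges each child's class histogram:
 * O(METRIC_NCLASS) integers per child, independent of the sample rate. */
static void aggregate(unsigned parent[METRIC_NCLASS],
                      const unsigned child[METRIC_NCLASS])
{
    for (int c = 0; c < METRIC_NCLASS; c++)
        parent[c] += child[c];
}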
@misc{engelmann10beyond,
  author        = "Christian Engelmann",
  title         = "Beyond Application-Level Checkpoint/Restart - {Advanced}
                   Software Approaches for Fault Resilience",
  month         = sep # "~6, ",
  year          = "2010",
  howpublished  = "{Talk at the
                   \href{http://www.speedup.ch/workshops/w39_2010.html}
                   {$39^{th}$ SPEEDUP Workshop on High Performance Computing},
                   Zurich, Switzerland}",
  url           = "http://www.christian-engelmann.info/publications/engelmann10beyond.ppt.pdf"
}
@misc{engelmann10reliability,
  author        = "Christian Engelmann and
                   Stephen L. Scott",
  title         = "Reliability, Availability, and Serviceability ({RAS}) for
                   Petascale High-End Computing and Beyond",
  month         = jun # "~22, ",
  year          = "2010",
  howpublished  = "{Talk at the \href{http://www.usenix.org/events/fastos10}
                   {Forum to Address Scalable Technology for Runtime and
                   Operating Systems (FAST-OS) Workshop}, in conjunction with
                   the \href{http://www.usenix.org/events/confweek10}{USENIX
                    Federated Conferences Week (USENIX) 2010}, Boston, MA, USA}",
  url           = "http://www.christian-engelmann.info/publications/engelmann10reliability.ppt.pdf",
  abstract      = "This project aims at scalable technologies for providing
                   high-level RAS for next-generation petascale scientific
                   high-performance computing (HPC) resources and beyond as
                   outlined by the U.S. Department of Energy (DOE) Forum to
                   Address Scalable Technology for Runtime and Operating
                   Systems (FAST-OS) and the U.S. National Coordination Office
                   for Networking and Information Technology Research and
                   Development (NCO/NITRD) High-End Computing Revitalization
                   Task Force (HECRTF) activities. Based on virtualized
                   adaptation, reconfiguration, and preemptive measures, the
                   ultimate goal is to provide for non-stop scientific computing
                   on a 24x7 basis without interruption. The technical approach
                   taken leverages system-level virtualization technology to
                   enable transparent proactive and reactive fault tolerance
                   mechanisms on extreme scale HPC systems. This effort targets:
                   (1) reliability analysis for identifying pre-fault
                   indicators, predicting failures, and modeling and monitoring
                   component and system reliability, (2) proactive fault
                   tolerance technology based on preemptive migration away from
                   components that are about to fail, (3) reactive fault
                   tolerance enhancements, such as checkpoint interval and
                   placement adaptation to actual and predicted system health
                   threats, and (4) holistic fault tolerance through combination
                   of adaptive proactive and reactive fault tolerance."
}
@misc{engelmann10resilience,
  author        = "Christian Engelmann",
  title         = "Resilience Challenges at the Exascale",
  month         = mar # "~8-11, ",
  year          = "2010",
  howpublished  = "{Talk at the
                   \href{http://www.csm.ornl.gov/workshops/SOS14}{$14^{th}$
                   Workshop on Distributed Supercomputing (SOS) 2010}, Savannah,
                   GA, USA}",
  url           = "http://www.christian-engelmann.info/publications/engelmann10resilience.ppt.pdf",
  abstract      = "The path to exascale computing poses several research
                   challenges related to power, performance, resilience,
                   productivity, programmability, data movement, and data
                   management. Resilience, i.e., providing efficiency and
                   correctness in the presence of faults, is one of the most
                   important exascale computer science challenges as systems
                   scale up in component count and component reliability
                   decreases. This talk discusses the future needs in
                   resilience research, development, and standards work
                   based on the outcomes from the National HPC Workshop on
                   Resilience, held in Arlington, VA, USA on August 12-14,
                   2009."
}
@misc{engelmann10hpc,
  author        = "Christian Engelmann
                   and Stephen L. Scott",
  title         = "{HPC} System Software Research at {Oak Ridge National
                   Laboratory}",
  month         = feb # "~22, ",
  year          = "2010",
  howpublished  = "{Seminar at the \href{http://www.lrz-muenchen.de}{Leibniz
                    Rechenzentrum (LRZ)}, Garching, Germany}",
  url           = "http://www.christian-engelmann.info/publications/engelmann10hpc.ppt.pdf",
  abstract      = "Oak Ridge National Laboratory (ORNL) is the largest energy
                   laboratory in the United States. Its National Center for
                   Computational Sciences (NCCS) provides the most powerful
                   computing resources in the world for open scientific
                   research. Jaguar, a Cray XT5 system at NCCS, is the fastest
                   supercomputer in the world. It recently ranked \#1 in the Top
                   500 List of Supercomputer Sites with a maximal LINPACK
                   benchmark performance of 1.759 PFlop/s and a theoretical peak
                   performance of 2.331 PFlop/s, where 1 PFlop/s is $10^{15}$
                   Floating Point Operations Per Second. Annually, 80 percent of
                   Jaguar's resources are allocated through the U.S. Department
                   of Energy's Innovative and Novel Computational Impact on
                   Theory and Experiment (INCITE) program, a competitively
                   selected, peer reviewed process open to researchers from
                   universities, industry, government and non-profit
                   organizations. These allocations address some of the most
                   challenging scientific problems in areas such as climate
                   modeling, renewable energy, materials science, fusion and
                   combustion. In conjunction with NCCS, the Computer Science
                   and Mathematics Division at ORNL performs basic and applied
                   research in HPC, mathematics, and intelligent systems. This
                   talk gives a summary of the HPC research and development in
                   system software performed at ORNL, including resilience at
                   extreme scale and virtualization technologies in HPC.
                   Specifically, this talk will focus on advanced resilience
                   technologies, such as migration of computation away from
                   components that are about to fail and on management and
                   customization of virtualized environments."
}
@misc{engelmann09high2,
  author        = "Christian Engelmann",
  title         = "High-Performance Computing Research Internship and Appointment
                   Opportunities at {Oak Ridge National Laboratory}",
  month         = dec # "~14, ",
  year          = "2009",
  howpublished  = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
                   of Computer Science}, \href{http://www.reading.ac.uk}
                   {University of Reading}, Reading, United Kingdom}",
  url           = "http://www.christian-engelmann.info/publications/engelmann09high2.ppt.pdf",
  abstract      = "Oak Ridge National Laboratory (ORNL) is the largest energy
                   laboratory in the United States. Its National Center for
                   Computational Sciences (NCCS) provides the most powerful
                   computing resources in the world for open scientific
                   research. Jaguar, a Cray XT5 system at NCCS, is the fastest
                   supercomputer in the world. It recently ranked \#1 in the Top
                   500 List of Supercomputer Sites with a maximal LINPACK
                   benchmark performance of 1.759 PFlop/s and a theoretical peak
                   performance of 2.331 PFlop/s, where 1 PFlop/s is $10^{15}$
                   Floating Point Operations Per Second. Annually, 80 percent of
                   Jaguar's resources are allocated through the U.S. Department
                   of Energy's Innovative and Novel Computational Impact on 
                   Theory and Experiment (INCITE) program, a competitively
                   selected, peer reviewed process open to researchers from
                   universities, industry, government and non-profit
                   organizations. These allocations address some of the most
                   challenging scientific problems in areas such as climate
                   modeling, renewable energy, materials science, fusion and
                   combustion. In conjunction with NCCS, the Computer Science
                   and Mathematics Division at ORNL performs basic and applied
                   research in HPC, mathematics, and intelligent systems. This
                   talk gives a summary of the HPC research performed at ORNL.
                   It provides details about the Jaguar peta-scale computing
                   resource, an overview of the computational science research
                   carried out using ORNL's computing resources, and a
                   description of various computer science efforts targeting
                   solutions for next-generation HPC systems. This talk also
                   provides information about internship opportunities for MSc
                   students and research appointment opportunities for recent
                   graduates."
}
@misc{engelmann09jcas,
  author        = "Christian Engelmann",
  title         = "{JCAS} - {IAA} Simulation Efforts at {Oak Ridge National
                   Laboratory}",
  month         = sep # "~1-2, ",
  year          = "2009",
  howpublished  = "{Invited talk at the
                   \href{http://www.cs.sandia.gov/CSRI/Workshops/2009/IAA}
                   {IAA Workshop on HPC Architectural Simulation (HPCAS)},
                   Boulder, CO, USA}",
  url           = "http://www.christian-engelmann.info/publications/engelmann09jcas.ppt.pdf"
}
@misc{engelmann09modeling,
  author        = "Christian Engelmann",
  title         = "Modeling Techniques Towards Resilience",
  month         = aug # "~12-14, ",
  year          = "2009",
  howpublished  = "{Invited talk at the
                   \href{http://institute.lanl.gov/resilience/conferences/2009}
                   {National HPC Workshop on Resilience 2009}, Arlington, VA,
                   USA}",
  url           = "http://www.christian-engelmann.info/publications/engelmann09modeling.ppt.pdf"
}
@misc{engelmann09system,
  author        = "Christian Engelmann",
  title         = "System Resilience Research at {ORNL} in the Context of
                   {HPC}",
  month         = may # "~15, ",
  year          = "2009",
  howpublished  = "{Invited talk at the \href{http://www.inria.fr/inria/organigramme/fiche_ur-ren.fr.html}
                   {Institut National de Recherche en Informatique et en
                   Automatique (INRIA)}, Rennes, France}",
  url           = "http://www.christian-engelmann.info/publications/engelmann09system.pdf",
  abstract      = "The continuing growth in high performance computing (HPC)
                   system scale poses a challenge for system software and
                   scientific applications with respect to reliability,
                   availability and serviceability (RAS). With only very few
                   exceptions, the availability of recently installed systems
                   has been lower in comparison to the same deployment phase of
                   their predecessors. As a result, sites lower allowable job
                   run times in order to force applications to store
                   intermediate results (checkpoints) as insurance against lost
                   computation time. However, checkpoints themselves waste
                   valuable computation time and resources. In contrast to the
                   experienced loss of availability, the demand for continuous
                   availability has risen dramatically with the trend towards
                   capability computing, which drives the race for scientific
                   discovery by running applications on the fastest machines
                   available while desiring significant amounts of time (weeks
                   and months) without interruption. These machines must be able
                   to run in the event of frequent interrupts in such a manner
                   that the capability is not severely degraded. Thus, research
                   and development of scalable RAS technologies is paramount to
                   the success of future extreme-scale systems. This talk
                   summarizes our accomplishments in the area of high-level RAS
                   for HPC, such as developed concepts and implemented
                   proof-of-concept prototypes."
}
@misc{engelmann09high,
  author        = "Christian Engelmann",
  title         = "High-Performance Computing Research and {MSc} Internship
                   Opportunities at {Oak Ridge National Laboratory}",
  month         = may # "~11, ",
  year          = "2009",
  howpublished  = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
                   of Computer Science}, \href{http://www.reading.ac.uk}
                   {University of Reading}, Reading, United Kingdom}",
  url           = "http://www.christian-engelmann.info/publications/engelmann09high.pdf",
  abstract      = "Oak Ridge National Laboratory (ORNL) is the largest energy
                   laboratory in the United States. Its National Center for
                   Computational Sciences (NCCS) provides the most powerful
                   computing resources in the world for open scientific
                   research. Jaguar, a Cray XT5 system at NCCS, is the second
                   HPC system to exceed 1 PFlop/s ($10^{15}$ Floating Point
                   Operations Per Second), and the fastest open science
                   supercomputer in the world. It recently ranked \#2 in the Top
                   500 List of Supercomputer Sites with a maximal LINPACK
                   benchmark performance of 1.059 PFlop/s and a theoretical peak
                   performance of 1.3814 PFlop/s. Annually, 80 percent of
                   Jaguar's resources are allocated through the U.S. Department
                   of Energy's Innovative and Novel Computational Impact on
                   Theory and Experiment (INCITE) program, a competitively
                   selected, peer reviewed process open to researchers from
                   universities, industry, government and non-profit
                   organizations. These allocations address some of the most
                   challenging scientific problems in areas such as climate
                   modeling, renewable energy, materials science, fusion and
                   combustion. In conjunction with NCCS, the Computer Science
                   and Mathematics Division at ORNL performs basic and applied
                   research in HPC, mathematics, and intelligent systems. This
                   talk gives a summary of the HPC research performed at ORNL.
                   It provides details about the Jaguar peta-scale computing
                   resource, an overview of the computational science research
                   carried out using ORNL's computing resources, and a
                   description of various computer science efforts targeting
                   solutions for next-generation HPC systems. This talk also
                   provides information about internship opportunities for MSc
                   students."
}
@misc{engelmann09modular,
  author        = "Christian Engelmann",
  title         = "Modular Redundancy for Soft-Error Resilience in Large-Scale
                   {HPC} Systems",
  month         = may # "~3-8, ",
  year          = "2009",
  howpublished  = "{Invited talk at the \href{http://www.dagstuhl.de/en/program/calendar/semhp/?semnr=09191}
                   {Dagstuhl Seminar on Fault Tolerance in High-Performance
                   Computing and Grids}, Schloss Dagstuhl, Wadern, Germany}",
  url           = "http://www.christian-engelmann.info/publications/engelmann09modular.pdf",
  abstract      = "Recent investigations into resilience of large-scale
                   high-performance computing (HPC) systems showed a continuous
                   trend of decreasing reliability and availability. Newly
                   installed systems have a lower mean-time to failure (MTTF)
                   and a higher mean-time to recover (MTTR) than their
                   predecessors. Modular redundancy is being used in many
                   mission critical systems today to provide for resilience,
                   such as aerospace and command \& control systems. The
                   primary argument against modular redundancy for resilience
                   in HPC has always been that the capability of an HPC system,
                   and respective return on investment, would be significantly
                   reduced. We argue that modular redundancy can significantly
                   increase compute node availability as it removes the impact
                   of scale from single compute node MTTR. We further argue that
                   single compute nodes can be much less reliable, and therefore
                   less expensive, and still be highly available, if their
                   MTTR/MTTF ratio is maintained."
}
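Editor's note: the MTTR/MTTF argument can be made concrete with the
standard steady-state availability formula (assumed here; the abstract
does not spell it out):
$$ A = \frac{MTTF}{MTTF + MTTR} = \frac{1}{1 + MTTR/MTTF}, $$
so single-node availability depends only on the MTTR/MTTF ratio: a node
whose MTTF and MTTR both shrink tenfold keeps the same $A$. A
dual-modular pair is unavailable only when both replicas are down,
giving $A_{pair} = 1 - (1 - A)^2$, which is the sense in which cheaper,
less reliable nodes can remain highly available.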
@misc{engelmann09proactive2,
  author        = "Christian Engelmann",
  title         = "Proactive Fault Tolerance Using Preemptive Migration",
  month         = apr # "~22-24, ",
  year          = "2009",
  howpublished  = "{Invited talk at the
                   \href{http://acet.rdg.ac.uk/events/details/cancun.php}
                   {$3^{rd}$ Collaborative and Grid Computing Technologies
                   Workshop (CGCTW) 2009}, Cancun, Mexico}",
  url           = "http://www.christian-engelmann.info/publications/engelmann09proactive2.pdf",
  abstract      = "The continuing growth in high-performance computing (HPC)
                   system scale poses a challenge for system software and
                   scientific applications with respect to reliability,
                   availability and serviceability (RAS). In order to address
                   anticipated high failure rates, resiliency characteristics
                   have become an urgent priority for next-generation HPC
                   systems. The concept of proactive fault tolerance prevents
                   compute node failures from impacting running parallel
                   applications by preemptively migrating application parts
                   away from nodes that are about to fail. This talk presents
                   our past and ongoing efforts in proactive fault resilience
                   for HPC. Presented work includes proactive fault resilience
                   techniques, transparent process- and virtual-machine-level
                   migration, system and application reliability models and
                   analyses, failure prediction, and trade-off models for
                   combining preemptive migration with checkpoint/restart. All
                   these individual technologies are put into context with a
                   proposed holistic HPC fault resilience framework."
}
@misc{engelmann09resiliency,
  author        = "Christian Engelmann",
  title         = "Resiliency",
  month         = mar # "~9-12, ",
  year          = "2009",
  howpublished  = "{Panel at the
                   \href{http://www.cs.sandia.gov/Conferences/SOS13}{$13^{th}$
                   Workshop on Distributed Supercomputing (SOS) 2009}, Hilton
                   Head, SC, USA}"
}
@misc{engelmann08high,
  author        = "Christian Engelmann",
  title         = "High-Performance Computing Research at {Oak Ridge National
                   Laboratory}",
  month         = dec # "~8, ",
  year          = "2008",
  howpublished  = "{Invited talk at the Reading Annual Computational Science
                    Workshop, Reading, United Kingdom}",
  url           = "http://www.christian-engelmann.info/publications/engelmann08high.pdf",
  abstract      = "Oak Ridge National Laboratory (ORNL) is the largest energy
                   laboratory in the United States. Its National Center for
                   Computational Sciences (NCCS) provides the most powerful
                   computing resources in the world for open scientific
                   research. Jaguar, a Cray XT5 system at NCCS, is the second
                   HPC system to exceed 1 PFlop/s ($10^{15}$ Floating Point
                   Operations Per Second), and the fastest open science
                   supercomputer in the world. It recently ranked \#2 in the Top
                   500 List of Supercomputer Sites with a maximal LINPACK
                   benchmark performance of 1.059 PFlop/s and a theoretical peak
                   performance of 1.3814 PFlop/s. Annually, 80 percent of
                   Jaguar's resources are allocated through the U.S. Department
                   of Energy's Innovative and Novel Computational Impact on
                   Theory and Experiment (INCITE) program, a competitively
                   selected, peer reviewed process open to researchers from
                   universities, industry, government and non-profit
                   organizations. These allocations address some of the most
                   challenging scientific problems in areas such as climate
                   modeling, renewable energy, materials science, fusion and
                   combustion. In conjunction with NCCS, the Computer Science
                   and Mathematics Division at ORNL performs basic and applied
                   research in HPC, mathematics, and intelligent systems. This
                   talk gives a summary of the HPC research performed at ORNL.
                   It provides details about the Jaguar peta-scale computing
                   resource, an overview of the computational science research
                   carried out using ORNL's computing resources, and a
                   description of various computer science efforts targeting
                   solutions for next-generation HPC systems."
}
@misc{engelmann08modular,
  author        = "Christian Engelmann",
  title         = "Modular Redundancy in {HPC} Systems: {W}hy, Where, When and How?",
  month         = oct # "~15, ",
  year          = "2008",
  howpublished  = "{Invited talk at the $1^{st}$ HPC Resiliency Summit: Workshop
                   on Resiliency for Petascale HPC 2008, in conjunction with the
                   \href{http://www.lanl.gov/conferences/lacss/2008}{$1^{st}$
                   Los Alamos Computer Science Symposium (LACSS) 2008}, Santa
                   Fe, NM, USA}",
  url           = "http://www.christian-engelmann.info/publications/engelmann08modular.ppt.pdf",
  abstract      = "The continuing growth in high-performance computing (HPC)
                   system scale poses a challenge for system software and
                   scientific applications with respect to reliability,
                   availability and serviceability (RAS). With only very few
                   exceptions, the availability of recently installed systems
                   has been lower in comparison to the same deployment phase of
                   their predecessors. As a result, sites lower allowable job
                   run times in order to force applications to store
                   intermediate results (checkpoints) as insurance against lost
                   computation time. However, checkpoints themselves waste
                   valuable computation time and resources. In contrast to the
                   experienced loss of availability, the demand for continuous
                   availability has risen dramatically with the trend towards
                   capability computing, which drives the race for scientific
                   discovery by running applications on the fastest machines
                   available while desiring significant amounts of time (weeks
                   and months) without interruption. These machines must be able
                   to run in the event of frequent interrupts in such a manner
                   that the capability is not severely degraded. Thus, research
                   and development of scalable RAS technologies is paramount to
                   the success of future extreme-scale systems. This talk
                   summarizes our past accomplishments, ongoing work, and future
                   plans in the area of high-level RAS for HPC."
}
@misc{engelmann08resiliency,
  author        = "Christian Engelmann",
  title         = "Resiliency for High-Performance Computing",
  month         = apr # "~10-12, ",
  year          = "2008",
  howpublished  = "{Invited talk at the
                   \href{http://acet.rdg.ac.uk/events/details/cancun.php}
                   {$2^{nd}$ Collaborative and Grid Computing Technologies
                   Workshop (CGCTW) 2008}, Cancun, Mexico}",
  url           = "http://www.christian-engelmann.info/publications/engelmann08resiliency.ppt.pdf",
  abstract      = "In order to address anticipated high failure rates,
                   resiliency characteristics have become an urgent priority for
                   next-generation high-performance computing (HPC) systems. One
                   major source of concern are non-recoverable soft errors,
                   i.e., bit flips in memory, cache, registers, and logic. The
                   probability of such errors not only grows with system size,
                   but also with increasing architectural vulnerability caused
                   by employing accelerators and by shrinking nanometer
                   technology. Reactive fault tolerance technologies, such as
                   checkpoint/restart, are unable to handle high failure rates
                   due to associated overheads, while proactive resiliency
                   technologies, such as preemptive migration, simply fail as
                   random soft errors can't be predicted. This talk proposes a
                   new, bold direction in resiliency for HPC as it targets
                   resiliency for next-generation extreme-scale HPC systems at
                   the system software level through computational redundancy
                   strategies, i.e., dual- and triple-modular redundancy."
}
@misc{engelmann08advanced,
  author        = "Christian Engelmann",
  title         = "Advanced Fault Tolerance Solutions for High Performance
                   Computing",
  month         = feb # "~11, ",
  year          = "2008",
  howpublished  = "{Seminar at the \href{http://www.laas.fr}{Laboratoire
                   d'Analyse et d'Architecture des Syst\`emes},
                   \href{http://www.cnrs.fr}{Centre National de la Recherche
                   Scientifique}, Toulouse, France}",
  url           = "http://www.christian-engelmann.info/publications/engelmann08advanced.ppt.pdf",
  abstract      = "The continuing growth in high performance computing (HPC)
                   system scale poses a challenge for system software and
                   scientific applications with respect to reliability,
                   availability and serviceability (RAS). With only very few
                   exceptions, the availability of recently installed systems
                   has been lower in comparison to the same deployment phase of
                   their predecessors. As a result, sites lower allowable job
                   run times in order to force applications to store
                   intermediate results (checkpoints) as insurance against lost
                   computation time. However, checkpoints themselves waste
                   valuable computation time and resources. In contrast to the
                   experienced loss of availability, the demand for continuous
                   availability has risen dramatically with the trend towards
                   capability computing, which drives the race for scientific
                   discovery by running applications on the fastest machines
                   available while desiring significant amounts of time (weeks
                   and months) without interruption. These machines must be able
                   to run in the event of frequent interrupts in such a manner
                   that the capability is not severely degraded. Thus, research
                   and development of scalable RAS technologies is paramount to
                   the success of future extreme-scale systems. This talk
                   summarizes our accomplishments in the area of high-level RAS
                   for HPC, such as developed concepts and implemented
                   proof-of-concept prototypes, and describes existing
                   limitations, such as performance issues, which need to be
                   dealt with for production-type deployment."
}
@misc{engelmann07service,
  author        = "Christian Engelmann",
  title         = "Service-Level High Availability in Parallel and Distributed
                   Systems",
  month         = oct # "~10, ",
  year          = "2007",
  howpublished  = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
                   of Computer Science}, \href{http://www.reading.ac.uk}
                   {University of Reading}, Reading, United Kingdom}",
  url           = "http://www.christian-engelmann.info/publications/engelmann07service.pdf",
  abstract      = "As service-oriented architectures become more important in
                   parallel and distributed computing systems, individual
                   service instance reliability as well as appropriate service
                   redundancy are essential to increase overall system
                   availability. This talk focuses on redundancy strategies
                   using service-level replication techniques. An overview of
                   existing programming models for service-level high
                   availability is presented and their differences,
                   similarities, advantages, and disadvantages are discussed.
                   Recent advances in providing service-level symmetric
                   active/active high availability are discussed. While the
                   primary target of the presented research is high availability
                   for service nodes in tightly-coupled extreme-scale
                   high-performance computing (HPC) systems, it is also
                   applicable to loosely-coupled distributed computing
                   scenarios."
}
@misc{engelmann07advanced2,
  author        = "Christian Engelmann",
  title         = "Advanced Fault Tolerance Solutions for High Performance
                   Computing",
  month         = jun # "~8, ",
  year          = "2007",
  howpublished  = "{Invited talk at the
                   \href{http://www.thaigrid.or.th/wttc2007}{Workshop on Trends,
                   Technologies and Collaborative Opportunities in High
                    Performance and Grid Computing (WTTC) 2007}, Khon Kaen,
                   Thailand}",
  url           = "http://www.christian-engelmann.info/publications/engelmann07advanced2.ppt.pdf",
  abstract      = "The continuing growth in high performance computing (HPC)
                   system scale poses a challenge for system software and
                   scientific applications with respect to reliability,
                   availability and serviceability (RAS). With only very few
                   exceptions, the availability of recently installed systems
                   has been lower in comparison to the same deployment phase of
                   their predecessors. As a result, sites lower allowable job
                   run times in order to force applications to store
                   intermediate results (checkpoints) as insurance against lost
                   computation time. However, checkpoints themselves waste
                   valuable computation time and resources. In contrast to the
                   experienced loss of availability, the demand for continuous
                   availability has risen dramatically with the trend towards
                   capability computing, which drives the race for scientific
                   discovery by running applications on the fastest machines
                   available while desiring significant amounts of time (weeks
                   and months) without interruption. These machines must be able
                   to run in the event of frequent interrupts in such a manner
                   that the capability is not severely degraded. Thus, research
                   and development of scalable RAS technologies is paramount to
                   the success of future extreme-scale systems. This talk
                   summarizes our accomplishments in the area of high-level RAS
                   for HPC, such as developed concepts and implemented
                   proof-of-concept prototypes, and describes existing
                   limitations, such as performance issues, which need to be
                   dealt with for production-type deployment."
}
@misc{engelmann07advanced,
  author        = "Christian Engelmann",
  title         = "Advanced Fault Tolerance Solutions for High Performance
                   Computing",
  month         = jun # "~4-5, ",
  year          = "2007",
  howpublished  = "{Invited talk at the
                   \href{http://www.thaigrid.or.th/wttc2007}{Workshop on Trends,
                   Technologies and Collaborative Opportunities in High
                    Performance and Grid Computing (WTTC) 2007}, Khon Kaen,
                   Thailand}",
  url           = "http://www.christian-engelmann.info/publications/engelmann07advanced.ppt.pdf",
  abstract      = "The continuing growth in high performance computing (HPC)
                   system scale poses a challenge for system software and
                   scientific applications with respect to reliability,
                   availability and serviceability (RAS). With only very few
                   exceptions, the availability of recently installed systems
                   has been lower in comparison to the same deployment phase of
                   their predecessors. As a result, sites lower allowable job
                   run times in order to force applications to store
                   intermediate results (checkpoints) as insurance against lost
                   computation time. However, checkpoints themselves waste
                   valuable computation time and resources. In contrast to the
                   experienced loss of availability, the demand for continuous
                   availability has risen dramatically with the trend towards
                   capability computing, which drives the race for scientific
                   discovery by running applications on the fastest machines
                   available while desiring significant amounts of time (weeks
                   and months) without interruption. These machines must be
                   able to run in the event of frequent interrupts in such a
                   manner that the capability is not severely degraded. Thus,
                   research and development of scalable RAS technologies is
                   paramount to the success of future extreme-scale systems.
                   This talk summarizes our accomplishments in the area of
                   high-level RAS for HPC, such as developed concepts and
                   implemented proof-of-concept prototypes, and describes
                   existing limitations, such as performance issues, which
                   need to be dealt with for production-type deployment."
}
@misc{engelmann07operating,
  author        = "Christian Engelmann",
  title         = "Operating System Research at {ORNL}: {S}ystem-level
                   Virtualization",
  month         = apr # "~10, ",
  year          = "2007",
  howpublished  = "{Seminar at the \href{http://www.gup.uni-linz.ac.at}
                   {Institute of Graphics and Parallel Processing},
                   \href{http://www.uni-linz.ac.at}{Johannes Kepler University},
                   Linz, Austria}",
  url           = "http://www.christian-engelmann.info/publications/engelmann07operating.ppt.pdf",
  abstract      = "The emergence of virtualization enabled hardware, such as the
                   latest generation AMD and Intel processors, has raised
                   significant interest in the High Performance Computing (HPC)
                   community. In particular, system-level virtualization
                   provides an opportunity to advance the design and development
                   of operating systems, programming environments,
                   administration practices, and resource management tools. This
                   leads to some potential research topics for HPC, such as
                   failure tolerance, system management, and solutions for
                   application porting to new HPC platforms. This talk will
                   present an overview of the research in System-level
                   Virtualization being conducted by the Systems Research Team in
                   the Computer Science Research Group at Oak Ridge National
                   Laboratory."
}
@misc{engelmann07towards,
  author        = "Christian Engelmann",
  title         = "Towards High Availability for High-Performance Computing
                   System Services: {A}ccomplishments and Limitations",
  month         = mar # "~14, ",
  year          = "2007",
  howpublished  = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
                   of Computer Science}, \href{http://www.reading.ac.uk}
                   {University of Reading}, Reading, United Kingdom}",
  url           = "http://www.christian-engelmann.info/publications/engelmann07towards.pdf",
  abstract      = "During the last several years, our teams at Oak Ridge
                   National Laboratory, Louisiana Tech University, and Tennessee
                   Technological University focused on efficient redundancy
                   strategies for head and service nodes of high-performance
                   computing (HPC) systems in order to pave the way for high
                   availability (HA) in HPC. These nodes typically run critical
                   HPC system services, like job and resource management, and
                   represent single points of failure and control for an entire
                   HPC system. The overarching goal of our research is to
                   provide high-level reliability, availability, and
                   serviceability (RAS) for HPC systems by combining HA and HPC
                   technology. This talk summarizes our accomplishments, such as
                   developed concepts and implemented proof-of-concept
                   prototypes, and describes existing limitations, such as
                   performance issues, which need to be dealt with for
                   production-type deployment."
}
@misc{engelmann06high,
  author        = "Christian Engelmann",
  title         = "High Availability for Ultra-Scale High-End Scientific
                   Computing",
  month         = jun # "~9, ",
  year          = "2006",
  howpublished  = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
                   of Computer Science}, \href{http://www.reading.ac.uk}
                   {University of Reading}, Reading, United Kingdom}",
  url           = "http://www.christian-engelmann.info/publications/engelmann06high.ppt.pdf",
  abstract      = "A major concern in exploiting ultra-scale architectures for
                   scientific high-end computing (HEC) with tens to hundreds of
                   thousands of processors, such as the IBM Blue Gene/L and the
                   Cray X1, is the potential inability to identify problems and
                   take preemptive action before a failure impacts a running
                   job. In fact, in systems of this scale, predictions estimate
                   the mean time to interrupt in terms of hours. Current
                   solutions for fault-tolerance in HEC focus on dealing with
                   the result of a failure. However, most are unable to handle
                   runtime system configuration changes caused by failures and
                   require a complete restart of essential system services
                   (e.g. MPI) or even of the entire machine. High availability
                   (HA) computing strives to avoid the problems of unexpected
                   failures through preemptive measures. There are various
                   techniques to implement high availability. In contrast to
                   active/hot-standby high availability with its fail-over
                   model, active/active high availability with its virtual
                   synchrony model is superior in many areas including
                   scalability, throughput, availability and responsiveness.
                   However, it is significantly more complex. The overall goal
                   of our research is to expand today's effort in HA for HEC,
                   so that systems that have the ability to hot-swap hardware
                   components can be kept alive by an OS runtime environment
                   that understands the concept of dynamic system configuration.
                   This talk will present an overview of recent research at Oak
                   Ridge National Laboratory in high availability solutions for
                   ultra-scale scientific high-end computing."
}
@misc{scott06advancing,
  author        = "Stephen L. Scott
                   and Christian Engelmann",
  title         = "Advancing Reliability, Availability and Serviceability for
                   High-Performance Computing",
  month         = apr # "~19, ",
  year          = "2006",
  howpublished  = "{Seminar at the \href{http://www.gup.uni-linz.ac.at}
                   {Institute of Graphics and Parallel Processing},
                   \href{http://www.uni-linz.ac.at}{Johannes Kepler University},
                   Linz, Austria}",
  url           = "http://www.christian-engelmann.info/publications/scott06advancing.ppt.pdf",
  abstract      = "Today’s high performance computing systems have several
                   reliability deficiencies resulting in noticeable availability
                   and serviceability issues. For example, head and service
                   nodes represent a single point of failure and control for an
                   entire system as they render it inaccessible and unmanageable
                   in case of a failure until repair, causing a significant
                   downtime. Furthermore, current solutions for fault-tolerance
                   focus on dealing with the result of a failure. However, most
                   are unable to transparently mask runtime system configuration
                   changes caused by failures and require a complete restart of
                   essential system services, such as MPI, in case of a failure.
                   High availability computing strives to avoid the problems of
                   unexpected failures through preemptive measures. The overall
                   goal of our research is to expand today’s effort in high
                   availability for high-performance computing, so that systems
                   can be kept alive by an OS runtime environment that
                   understands the concepts of dynamic system configuration and
                   degraded operation mode. This talk will present an overview
                   of recent research performed at Oak Ridge National Laboratory
                   in collaboration with Louisiana Tech University, North
                   Carolina State University and the University of Reading in
                   developing core technologies and proof-of-concept prototypes
                   that improve the overall reliability, availability and
                   serviceability of high-performance computing systems."
}
@misc{engelmann05high4,
  author        = "Christian Engelmann",
  title         = "High Availability for Ultra-Scale High-End Scientific
                   Computing",
  month         = oct # "~18, ",
  year          = "2005",
  howpublished  = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
                   of Computer Science}, \href{http://www.reading.ac.uk}
                   {University of Reading}, Reading, United Kingdom}",
  url           = "http://www.christian-engelmann.info/publications/engelmann05high4.ppt.pdf",
  abstract      = "A major concern in exploiting ultra-scale architectures for
                   scientific high-end computing (HEC) with tens to hundreds of
                   thousands of processors, such as the IBM Blue Gene/L and the
                   Cray X1, is the potential inability to identify problems and
                   take preemptive action before a failure impacts a running
                   job. In fact, in systems of this scale, predictions estimate
                   the mean time to interrupt in terms of hours. Current
                   solutions for fault-tolerance in HEC focus on dealing with
                   the result of a failure. However, most are unable to handle
                   runtime system configuration changes caused by failures and
                   require a complete restart of essential system services (e.g.
                   MPI) or even of the entire machine. High availability (HA)
                   computing strives to avoid the problems of unexpected
                   failures through preemptive measures. There are various
                   techniques to implement high availability. In contrast to
                   active/hot-standby high availability with its fail-over
                   model, active/active high availability with its virtual
                   synchrony model is superior in many areas including
                   scalability, throughput, availability and responsiveness.
                   However, it is significantly more complex. The overall goal
                   of our research is to expand today's effort in HA for HEC, so
                   that systems that have the ability to hot-swap hardware
                   components can be kept alive by an OS runtime environment
                   that understands the concept of dynamic system configuration.
                   This talk will present an overview of recent research at Oak
                   Ridge National Laboratory in high availability solutions for
                   ultra-scale scientific high-end computing."
}
@misc{engelmann05high3,
  author        = "Christian Engelmann",
  title         = "High Availability for Ultra-Scale High-End Scientific
                   Computing",
  month         = sep # "~26, ",
  year          = "2005",
  howpublished  = "{Seminar at the \href{http://www.uncfsu.edu/macsc}{Department
                   of Mathematics and Computer Science},
                   \href{http://www.uncfsu.edu}{Fayetteville State University},
                   Fayetteville, NC, USA}",
  url           = "http://www.christian-engelmann.info/publications/engelmann05high3.ppt.pdf",
  abstract      = "A major concern in exploiting ultra-scale architectures for
                   scientific high-end computing (HEC) with tens to hundreds of
                   thousands of processors, such as the IBM Blue Gene/L and the
                   Cray X1, is the potential inability to identify problems and
                   take preemptive action before a failure impacts a running
                   job. In fact, in systems of this scale, predictions estimate
                   the mean time to interrupt in terms of hours. Current
                   solutions for fault-tolerance in HEC focus on dealing with
                   the result of a failure. However, most are unable to handle
                   runtime system configuration changes caused by failures and
                   require a complete restart of essential system services (e.g.
                   MPI) or even of the entire machine. High availability (HA)
                   computing strives to avoid the problems of unexpected
                   failures through preemptive measures. There are various
                   techniques to implement high availability. In contrast to
                   active/hot-standby high availability with its fail-over
                   model, active/active high availability with its virtual
                   synchrony model is superior in many areas including
                   scalability, throughput, availability and responsiveness.
                   However, it is significantly more complex. The overall goal
                   of our research is to expand today’s effort in HA for HEC, so
                   that systems that have the ability to hot-swap hardware
                   components can be kept alive by an OS runtime environment
                   that understands the concept of dynamic system configuration.
                   This talk will present an overview of recent research at Oak
                   Ridge National Laboratory in fault tolerance and high
                   availability solutions for ultra-scale scientific high-end
                   computing."
}
@misc{engelmann05high2,
  author        = "Christian Engelmann",
  title         = "High Availability for Ultra-Scale High-End Scientific
                   Computing",
  month         = may # "~13, ",
  year          = "2005",
  howpublished  = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
                   of Computer Science}, \href{http://www.reading.ac.uk}
                   {University of Reading}, Reading, United Kingdom}",
  url           = "http://www.christian-engelmann.info/publications/engelmann05high2.ppt.pdf",
  abstract      = "A major concern in exploiting ultra-scale architectures for
                   scientific high-end computing (HEC) with tens to hundreds of
                   thousands of processors, such as the IBM Blue Gene/L and the
                   Cray X1, is the potential inability to identify problems and
                   take preemptive action before a failure impacts a running
                   job. In fact, in systems of this scale, predictions estimate
                   the mean time to interrupt in terms of hours. Current
                   solutions for fault-tolerance in HEC focus on dealing with
                   the result of a failure. However, most are unable to handle
                   runtime system configuration changes caused by failures and
                   require a complete restart of essential system services (e.g.
                   MPI) or even of the entire machine. High availability (HA)
                   computing strives to avoid the problems of unexpected
                   failures through preemptive measures. There are various
                   techniques to implement high availability. In contrast to
                   active/hot-standby high availability with its fail-over
                   model, active/active high availability with its virtual
                   synchrony model is superior in many areas including
                   scalability, throughput, availability and responsiveness.
                   However, it is significantly more complex. The overall goal
                   of our research is to expand today’s effort in HA for HEC,
                   so that systems that have the ability to hot-swap hardware
                   components can be kept alive by an OS runtime environment
                   that understands the concept of dynamic system configuration.
                   This talk will present an overview of recent research at Oak
                   Ridge National Laboratory in fault-tolerant heterogeneous
                   metacomputing, advanced super-scalable algorithms and high
                   availability system software for ultra-scale scientific
                   high-end computing."
}
@misc{engelmann05high1,
  author        = "Christian Engelmann",
  title         = "High Availability for Ultra-Scale High-End Scientific
                   Computing",
  month         = apr # "~15, ",
  year          = "2005",
  howpublished  = "{Seminar at the \href{http://cenit.latech.edu}{Center for
                   Entrepreneurship and Information Technology},
                   \href{http://www.latech.edu}{Louisiana Tech University},
                   Ruston, LA, USA}",
  url           = "http://www.christian-engelmann.info/publications/engelmann05high1.ppt.pdf",
  abstract      = "A major concern in exploiting ultra-scale architectures for
                   scientific high-end computing (HEC) with tens to hundreds of
                   thousands of processors is the potential inability to
                   identify problems and take preemptive action before a failure
                   impacts a running job. In fact, in systems of this scale,
                   predictions estimate the mean time to interrupt in terms of
                   hours. Current solutions for fault-tolerance in HEC focus on
                   dealing with the result of a failure. However, most are
                   unable to handle runtime system configuration changes caused
                   by failures and require a complete restart of essential
                   system services (e.g. MPI) or even of the entire machine.
                   High availability (HA) computing strives to avoid the
                   problems of unexpected failures through preemptive measures.
                   There are various techniques to implement high availability.
                   In contrast to active/hot-standby high availability with its
                   fail-over model, active/active high availability with its
                   virtual synchrony model is superior in many areas including
                   scalability, throughput, availability and responsiveness.
                   However, it is significantly more complex. The overall goal
                   of this research is to expand today’s effort in HA for HEC,
                   so that systems that have the ability to hot-swap hardware
                   components can be kept alive by an OS runtime environment
                   that understands the concept of dynamic system configuration.
                   With the aim of addressing the future challenges of high
                   availability in ultra-scale HEC, this project intends to
                   develop a proof-of-concept implementation of an active/active
                   high availability system software framework."
}
@misc{engelmann04diskless,
  author        = "Christian Engelmann",
  title         = "Diskless Checkpointing on Super-scale Architectures --
                   {A}pplied to the Fast Fourier Transform",
  month         = feb # "~25, ",
  year          = "2004",
  howpublished  = "{Invited talk at the \href{http://www.siam.org/meetings/pp04}
                   {$11^{th}$ SIAM Conference on Parallel Processing for
                   Scientific Computing (SIAM PP) 2004}, San Francisco, CA,
                   USA}",
  url           = "http://www.christian-engelmann.info/publications/engelmann04diskless.ppt.pdf",
  abstract      = "This talk discusses the issue of fault-tolerance in
                   distributed computer systems with tens or hundreds of
                   thousands of diskless processor units. Such systems, like the
                   IBM Blue Gene/L, are predicted to be deployed in the next
                   five to ten years. Since a 100,000-processor system is going
                   to be less reliable, scientific applications need to be able
                   to recover from occurring failures more efficiently. In this
                   paper, we adapt the present technique of diskless
                   checkpointing to such huge distributed systems in order to
                   equip existing scientific algorithms with super-scalable
                   fault-tolerance. First, we discuss the method of diskless
                   checkpointing, then we adapt this technique to super-scale
                   architectures and finally we present results from an
                   implementation of the Fast Fourier Transform that uses the
                   adapted technique to achieve super-scale fault-tolerance."
}
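
As a side note for readers unfamiliar with the technique: diskless
checkpointing keeps checkpoints in peer memory instead of on disk and
protects them with parity. Below is a minimal sketch of the XOR-parity
variant; the names and data are illustrative, not the talk's actual FFT
implementation.

# Minimal sketch of XOR-parity diskless checkpointing (illustrative only).
from functools import reduce

def xor_blocks(blocks):
    # Bitwise XOR of equally sized byte blocks.
    return reduce(lambda a, b: bytes(x ^ y for x, y in zip(a, b)), blocks)

# Each process keeps its checkpoint in memory (equal-sized byte blocks).
checkpoints = {0: b'\x01\x02', 1: b'\x10\x20', 2: b'\xaa\xbb'}

# A dedicated parity process stores the XOR of all local checkpoints.
parity = xor_blocks(list(checkpoints.values()))

# Process 1 fails: rebuild its checkpoint from parity and the survivors.
survivors = [c for rank, c in checkpoints.items() if rank != 1]
assert xor_blocks(survivors + [parity]) == checkpoints[1]
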
@misc{engelmann04superscalable,
  author        = "Christian Engelmann",
  title         = "Super-scalable Algorithms -- {N}ext Generation Supercomputing
                   on 100,000 and more Processors",
  month         = jan # "~29, ",
  year          = "2004",
  howpublished  = "{Seminar at the \href{http://www.csm.ornl.gov}{Computer
                   Science and Mathematics Division}, \href{http://www.ornl.gov}
                   {Oak Ridge National Laboratory}, Oak Ridge, TN, USA}",
  url           = "http://www.christian-engelmann.info/publications/engelmann04superscalable.ppt.pdf",
  abstract      = "This talk discusses recent research into the issues and
                   potential problems of algorithm scalability and
                   fault-tolerance on next-generation high-performance computer
                   systems with tens and even hundreds of thousands of
                   processors. Such massively parallel computers, like the IBM
                   Blue Gene/L, are going to be deployed in the next five to ten
                   years and existing deficiencies in scalability and
                   fault-tolerance need to be addressed soon. Scientific
                   algorithms have shown poor scalability on 10,000-processor
                   systems that exist today. Furthermore, future systems will be
                   less reliable due to the large number of components.
                   Super-scalable algorithms, which have the properties of scale
                   invariance and natural fault-tolerance, are able to get the
                   correct answer despite multiple task failures and without
                   checkpointing. We will show that such algorithms exist for a
                   wide variety of problems, such as finite difference, finite
                   element, multigrid and global maximum. Despite these
                   findings, traditional algorithms may still be preferred due
                   to their known behavior, or simply because a super-scalable
                   algorithm does not exist or is hard to find for a particular
                   problem. In this case, we propose a peer-to-peer diskless
                   checkpointing algorithm that can provide scale invariant
                   fault-tolerance."
}
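
As a toy illustration of the natural fault-tolerance property described
above (not code from the talk), a gossip-style global-maximum computation
still converges to the correct answer over the surviving tasks when some
tasks fail mid-run, without any checkpointing:

import random

def gossip_max(values, failed, seed=0):
    # Gossip-style global maximum; tasks in 'failed' silently drop out.
    rng = random.Random(seed)
    live = [i for i in range(len(values)) if i not in failed]
    est = {i: values[i] for i in live}   # each live task's current estimate
    while len(set(est.values())) > 1:    # iterate until all survivors agree
        for i in live:
            j = rng.choice(live)         # exchange with a random live peer
            est[i] = est[j] = max(est[i], est[j])
    return est

values = [3, 41, 7, 29, 15, 8]
est = gossip_max(values, failed={1})       # the task holding 41 is lost
assert all(v == 29 for v in est.values())  # survivors agree on their max
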
@misc{engelmann03distributed,
  author        = "Christian Engelmann",
  title         = "Distributed Peer-to-Peer Control for {Harness}",
  month         = feb # "~11, ",
  year          = "2004",
  howpublished  = "{Seminar at the \href{http://www.csc.ncsu.edu}{Department of
                   Computer Science}, \href{http://www.ncsu.edu}{North Carolina
                   State University}, Raleigh, NC, USA}",
  url           = "http://www.christian-engelmann.info/publications/engelmann03distributed.ppt.pdf",
  abstract      = "Harness is an adaptable fault-tolerant virtual machine
                   environment for next-generation heterogeneous distributed
                   computing, developed as a follow-on to PVM. It additionally
                   enables the assembly of applications from plug-ins and
                   provides fault-tolerance. This work describes the distributed
                   control, which manages global state replication to ensure
                   high availability of the service. Group communication services
                   achieve an agreement on an initial global state and a linear
                   history of global state changes at all members of the
                   distributed virtual machine. This global state is replicated
                   to all members to easily recover from single, multiple and
                   cascaded faults. A peer-to-peer ring network architecture and
                   tunable multi-point failure conditions provide heterogeneity
                   and scalability. Finally, the integration of the distributed
                   control into the multi-threaded kernel architecture of
                   Harness offers a fault-tolerant global state database service
                   for plug-ins and applications."
}
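
The replication idea can be pictured independently of Harness
(hypothetical names, not the actual Harness API): once the group agrees on
an initial global state and a single linear order of state changes, every
member that applies them independently arrives at an identical replica, so
any survivor can continue the service after a failure.

# Sketch of state replication via an agreed linear history of changes.
def apply_history(initial, history):
    # Apply an ordered list of (key, value) state changes to a replica.
    state = dict(initial)
    for key, value in history:
        state[key] = value
    return state

# One agreed initial state and one agreed linear history of changes...
initial = {"members": 0}
history = [("members", 3), ("leader", "node-a"), ("members", 4)]

# ...yield identical replicated state on every member that applies them.
replicas = [apply_history(initial, history) for _ in range(3)]
assert replicas[0] == replicas[1] == replicas[2]
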
@mastersthesis{jones10simulation,
  author        = "Ian S. Jones",
  title         = "Simulation of Large Scale Architectures on High Performance
                   Computers",
  month         = oct # "~22, ",
  year          = "2010",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Advisors: Prof. Vassil N. Alexandrov (University of Reading);
                   Christian Engelmann (Oak Ridge National Laboratory);
                   George Bosilca (University of Tennessee, Knoxville)",
  url           = "http://www.christian-engelmann.info/publications/jones10simulation.pdf",
  url2          = "http://www.christian-engelmann.info/publications/jones10simulation.ppt.pdf",
  abstract      = "Powerful supercomputers often need to be simulated for the
                   purposes of testing the scalability of various applications.
                   This thesis endeavours to further develop the existing
                   simulator, XSIM, and implement the functionality to simulate
                   real-world networks and the latency that might be
                   encountered by messages travelling through such a network.
                   The upgraded simulator will then be tested at Oak Ridge
                   National Laboratory. The work completed herein should
                   provide a solid foundation for further improvements to
                   XSIM; it simulates a variety of basic network topologies,
                   calculating the shortest path for any given message and
                   generating a transmission time."
}
@mastersthesis{boehm10development,
  author        = "Swen B{\"o}hm",
  title         = "Development of a {RAS} Framework for {HPC} Environments:
                   {Realtime} Data Reduction of Monitoring Data",
  month         = mar # "~12, ",
  year          = "2010",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Advisors: Prof. Vassil N. Alexandrov (University of Reading);
                   Christian Engelmann (Oak Ridge National Laboratory);
                   George Bosilca (University of Tennessee, Knoxville)",
  url           = "http://www.christian-engelmann.info/publications/boehm10development.pdf",
  url2          = "http://www.christian-engelmann.info/publications/boehm10development.ppt.pdf",
  abstract      = "The advancements of high-performance computing (HPC) systems
                   in the last decades lead to more and more complex systems
                   containing thousands or tens-of-thousands computing systems
                   that are working together. While the computational performance
                   of these systems increased dramaticaly in the last years the
                   I/O subsystems have not gained such a significant improvement.
                   With increasing nummbers of hardware components in the next
                   generation HPC systems maintaining the relaiability of such
                   systems becomes more and more difficult since the probability
                   of hardware failures is increasing with the number of
                   components. The capacities of traditional reactive fault
                   tolerance technologies are exceeded by the development of next
                   generation systems and alternatives have to be found. This
                   paper discusses a monitoring system that is using data
                   reduction techniques to decrease the amount of the collected
                   data. The system is part of a proactive fault tolerance system
                   that may challenge the reliability problems of exascale
                   HPC systems."
}
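
One common real-time data-reduction technique for monitoring streams,
shown here purely as an illustration (the thesis does not necessarily use
this exact filter), is a deadband filter that forwards a sample only when
it differs from the last forwarded value by more than a threshold:

def deadband(samples, threshold):
    # Keep a (time, value) sample only if the value moved more than
    # 'threshold' since the last kept sample.
    kept, last = [], None
    for t, value in samples:
        if last is None or abs(value - last) > threshold:
            kept.append((t, value))
            last = value
    return kept

# Node temperature readings: mostly flat, with one excursion.
readings = [(0, 40.0), (1, 40.1), (2, 40.2), (3, 47.5), (4, 47.6), (5, 40.3)]
print(deadband(readings, threshold=1.0))
# [(0, 40.0), (3, 47.5), (5, 40.3)] -- half the samples to collect and store
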
@mastersthesis{lauer10simulation,
  author        = "Frank Lauer",
  title         = "Simulation of Advanced Large-Scale {HPC} Architectures",
  month         = mar # "~12, ",
  year          = "2010",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Advisors: Prof. Vassil N. Alexandrov (University of Reading);
                   Christian Engelmann (Oak Ridge National Laboratory);
                   George Bosilca (University of Tennessee, Knoxville)",
  url           = "http://www.christian-engelmann.info/publications/lauer10simulation.pdf",
  url2          = "http://www.christian-engelmann.info/publications/lauer10simulation.ppt.pdf",
  abstract      = "The rapid development of massive parallel systems in the high-
                   performance computing (HPC) area requires efficient
                   scalability of applications. The next generation's design of
                   supercomputers is today not certain in terms of what will be
                   the computational, memory and I/O capabilities. However it is
                   most certain that they become even more parallel. Getting
                   the most performance from these machines in not only a matter
                   of hardware, it is also an issue of programming design.
                   Therefore, it has to be a co-development. However, how to test
                   algorithm's on machines which are not existing today. To
                   address the programming issues in terms of scalability and
                   fault tolerance for the next generation, this projects aim is
                   to design and develop a simulator based on parallel discrete
                   event simulation (PDES) for applications using MPI
                   communication. Some of the fastest supercomputers in the world
                   already interconnecting $10^5$ cores together to catch up the
                   simulator will be able to simulate at least $10^7$ virtual
                   processes."
}
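
At its core, the discrete event approach mentioned in the abstract reduces
to a timestamped event queue. The stripped-down sequential sketch below
(illustrative only, not XSIM code) delivers simulated MPI-style messages
between virtual processes in timestamp order:

import heapq

def simulate(events):
    # Deliver (time, src, dst, payload) message events in timestamp order.
    queue = list(events)
    heapq.heapify(queue)
    while queue:
        time, src, dst, payload = heapq.heappop(queue)
        print(f"t={time:5.2f}: vp{src} -> vp{dst}: {payload}")

# Three virtual processes exchanging messages with latencies applied.
simulate([(1.5, 0, 1, "halo"), (0.2, 2, 0, "init"), (3.1, 1, 2, "result")])
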
@mastersthesis{litvinova09ras,
  author        = "Antonina Litvinova",
  title         = "{RAS} Framework Engine Prototype",
  month         = sep # "~22, ",
  year          = "2009",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Advisors: Prof. Vassil N. Alexandrov (University of Reading);
                   Christian Engelmann (Oak Ridge National Laboratory);
                   George Bosilca (University of Tennessee, Knoxville)",
  url           = "http://www.christian-engelmann.info/publications/litvinova09ras.pdf",
  url2          = "http://www.christian-engelmann.info/publications/litvinova09ras.ppt.pdf",
  abstract      = "Extreme high performance computing (HPC) systems constantly
                   increase in scale from a few thousands of processors cores
                   to thousands of thousands of processors cores and beyond.
                   However their system mean-time to interrupt decreases
                   according. The current approach of fault tolerance in HPC
                   is checkpoint/restart, i.e. a method based on recovery from
                   experienced failures. However checkpoint/restart cannot deal
                   with errors in the same efficient way anymore, because of
                   HPC systems modification. For example, increasing error
                   rates, increasing aggregate memory, and not proportionally
                   increasing input/output capabilities. The recently
                   introduced concept is proactive fault tolerance which
                   avoids experiencing failures through preventative measures.
                   Proactive fault tolerance uses migration which is an
                   emerging technology that prevents failures on HPC systems
                   by migrating applications or application parts away from
                   a node that is deteriorating to a spare node. This thesis
                   discusses work conducted at ORNL to develop a Proactive
                   Fault Tolerance Framework Engine Prototype for HPC systems
                   with high reliability, availability and serviceability.
                   The prototype performs environmental system monitoring,
                   system event logging, parallel job monitoring and system
                   resource monitoring in order to analyse HPC system
                   reliability and to perform fault avoidance through a
                   migration."
}
@mastersthesis{koenning07virtualized,
  author        = "Bj{\"o}rn K{\"o}nning",
  title         = "Virtualized Environments for the {Harness Workbench}",
  month         = mar # "~14, ",
  year          = "2007",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Advisors: Prof. Vassil N. Alexandrov (University of Reading);
                   Christian Engelmann (Oak Ridge National Laboratory)",
  url           = "http://www.christian-engelmann.info/publications/koenning07virtualized.pdf",
  url2          = "http://www.christian-engelmann.info/publications/koenning07virtualized.ppt.pdf",
  abstract      = "The expanded use of computational sciences today leads to a
                   significant need of high performance computing systems. High
                   performance computing is currently undergoing vigorous
                   revival, and multiple efforts are underway to develop much
                   faster computing systems in the near future. New software
                   tools are required for the efficient use of petascale
                   computing systems. With the new Harness Workbench Project
                   the Oak Ridge National Laboratory intends to develop an
                   appropriate development and runtime environment for high
                   performance computing platforms. This dissertation project
                   is part of the Harness Workbench Project, and deals with the
                   development of a concept for virtualised environments and
                   various approaches to create and describe them. The developed
                   virtualisation approach is based on the \verb|chroot|
                   mechanism and uses platform-independent environment
                   descriptions. File structures and environment variables are
                   emulated to provide the portability of computational software
                   over diverse high performance computing platforms.
                   Security measures and sandbox characteristics can be
                   integrated."
}
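
The chroot-based approach can be pictured with a short sketch
(hypothetical description format and paths, not the thesis' actual
design): a platform-independent description names a root file tree and the
environment variables to emulate, and a launcher enters it before starting
the tool. Note that os.chroot is Unix-only and requires root privileges.

import os

# Hypothetical platform-independent environment description.
description = {
    "root": "/srv/envs/hpc-tools",        # file structure to emulate
    "env": {"PATH": "/usr/bin:/bin"},     # environment variables to set
    "command": ["cc", "--version"],
}

def enter_environment(desc):
    # Run a command inside a chroot-confined, described environment.
    os.chroot(desc["root"])               # confine the file system view
    os.chdir("/")
    os.environ.clear()
    os.environ.update(desc["env"])        # emulate described variables
    os.execvp(desc["command"][0], desc["command"])  # replace this process

# enter_environment(description)  # needs root and a populated root tree
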
@mastersthesis{weber07high,
  author        = "Matthias Weber",
  title         = "High Availability for the {Lustre} File System",
  month         = mar # "~14, ",
  year          = "2007",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Double diploma in conjunction with the
                   \href{http://www.f1.fhtw-berlin.de}{Department of
                   Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
                   College for Engineering and Economics (FHTW) Berlin},
                   Germany. Advisors: Prof. Vassil N. Alexandrov (University of
                   Reading); Christian Engelmann (Oak Ridge National
                   Laboratory)",
  url           = "http://www.christian-engelmann.info/publications/weber07high.pdf",
  url2          = "http://www.christian-engelmann.info/publications/weber07high.ppt.pdf",
  abstract      = "With the growing importance of high performance computing
                   and, more importantly, the fast growing size of sophisticated
                   high performance computing systems, research in the area of
                   high availability is essential to meet the needs to sustain
                   the current growth. This Master thesis project aims to
                   improve the availability of Lustre. The major concern of
                   this project is the file system's metadata server, which
                   represents the last remaining single point of failure in
                   Lustre. To overcome this single point of failure, an
                   active/active high availability approach is introduced.
                   The new file system design with multiple MDS
                   nodes running in virtual synchrony leads to a significant
                   increase of availability. Two prototype implementations aim
                   to show how the proposed system design and its newly realized
                   form of symmetric active/active high availability can be
                   accomplished in practice. The results of this work point out
                   the difficulties in adapting the file system to the
                   active/active high availability design. Tests identify
                   functionality that was not achieved and show performance
                   problems of the
                   proposed solution. The findings of this dissertation may be
                   used for further work on high availability for distributed
                   file systems."
}
@mastersthesis{baumann06design,
  author        = "Ronald Baumann",
  title         = "Design and Development of Prototype Components for the
                   {Harness} High-Performance Computing Workbench",
  month         = mar # "~6, ",
  year          = "2006",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Double diploma in conjunction with the
                   \href{http://www.f1.fhtw-berlin.de}{Department of
                   Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
                   College for Engineering and Economics (FHTW) Berlin},
                   Germany. Advisors: Prof. Vassil N. Alexandrov (University of
                   Reading); George A. (Al) Geist and Christian Engelmann (Oak
                   Ridge National Laboratory)",
  url           = "http://www.christian-engelmann.info/publications/baumann06design.pdf",
  url2          = "http://www.christian-engelmann.info/publications/baumann06design.ppt.pdf",
  abstract      = "This master thesis examines plug-in technology, especially
                   the new field of parallel plug-ins. Plug-ins are popular
                   because they extend the capabilities of software packages
                   such as browsers and Photoshop, and allow an individual user
                   to add new functionality. Parallel plug-ins also provide the
                   above capabilities to a distributed set of resources, i.e.,
                   a plug-in now becomes a set of coordinating plug-ins. In
                   addition, the set of plug-ins may be heterogeneous either
                   in function or
                   because the underlying resources are heterogeneous. This new
                   dimension of complexity provides a rich research space which
                   is explored in this thesis. Experiences are collected and
                   presented as parallel plug-in paradigms and concepts. The
                   Harness framework was used in this project, in particular the
                   plug-in manager and available communication capabilities.
                   Plug-ins provide methods for users to extend Harness
                   according to their requirements. The result of this thesis is
                   a parallel plug-in paradigm and template for Harness. Users
                   of the Harness environment will be able to design and
                   implement their applications in the form of parallel plug-ins
                   easier and faster by using the paradigm resulting from this
                   project. Prototypes were implemented which handle different
                   aspects of parallel plug-ins. Parallel plug-in configurations
                   were tested on an appropriate number of Harness kernels,
                   including available communication and error-handling
                   capabilities. Furthermore, research was done in the area of
                   fault tolerance while parallel plug-ins are (un)loaded, as
                   well as while a task is performed."
}
@mastersthesis{uhlemann06high,
  author        = "Kai Uhlemann",
  title         = "High Availability for High-End Scientific Computing",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  month         = mar # "~6, ",
  year          = "2006",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Double diploma in conjunction with the
                   \href{http://www.f1.fhtw-berlin.de}{Department of
                   Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
                   College for Engineering and Economics (FHTW) Berlin},
                   Germany. Advisors: Prof. Vassil N. Alexandrov (University of
                   Reading); George A. (Al) Geist and Christian Engelmann (Oak
                   Ridge National Laboratory)",
  url           = "http://www.christian-engelmann.info/publications/uhlemann06high.pdf",
  url2          = "http://www.christian-engelmann.info/publications/uhlemann06high.ppt.pdf",
  abstract      = "With the growing interest and popularity in high performance
                   cluster computing and, more importantly, the fast growing
                   size of compute clusters, research in the area of high
                   availability is essential to meet the needs to sustain the
                   current growth. This Master thesis project introduces a new
                   approach for high availability focusing on the head node of a
                   cluster system. This project's focus is on providing high
                   availability to the job scheduler service, which is the most
                   vital part of the traditional Beowulf-style cluster
                   architecture. This research seeks to add high availability to
                   the job scheduler service and resource management system,
                   typically running on the head node, leading to a significant
                   increase of availability for cluster computing. Also, this
                   software project takes advantage of the virtual synchrony
                   paradigm to achieve active/active replication, the highest
                   form of high availability. A proof-of-concept implementation
                   shows how high availability can be designed in software and
                   what results can be expected of such a system. The results
                   may be reused for future or existing projects to further
                   improve and extend the high availability of compute
                   clusters."
}
@phdthesis{engelmann08symmetric3,
  author        = "Christian Engelmann",
  title         = "Symmetric Active/Active High Availability for
                   High-Performance Computing System Services",
  year          = "2008",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Advisor: Prof. Vassil N. Alexandrov (University of Reading)",
  url           = "http://www.christian-engelmann.info/publications/engelmann08symmetric3.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann08symmetric3.ppt.pdf",
  abstract      = "In order to address anticipated high failure rates,
                   reliability, availability and serviceability have become an
                   urgent priority for next-generation high-performance
                   computing (HPC) systems. This thesis aims to pave the way for
                   highly available HPC systems by focusing on their most
                   critical components and by reinforcing them with appropriate
                   high availability solutions. Service components, such as head
                   and service nodes, are the Achilles heel of a HPC system.
                   A failure typically results in a complete system-wide outage.
                   This thesis targets efficient software state replication
                   mechanisms for service component redundancy to achieve high
                   availability as well as high performance. Its methodology
                   relies on defining a modern theoretical foundation for
                   providing service-level high availability, identifying
                   availability deficiencies of HPC systems, and comparing
                   various service-level high availability methods. This thesis
                   showcases several developed proof-of-concept prototypes
                   providing high availability for services running on HPC head
                   and service nodes using the symmetric active/active
                   replication method, i.e., state-machine replication, to
                   complement prior work in this area using active/standby and
                   asymmetric active/active configurations. Presented
                   contributions include a generic taxonomy for service high
                   availability, an insight into availability deficiencies of
                   HPC systems, and a unified definition of service-level high
                   availability methods. Further contributions encompass a fully
                   functional symmetric active/active high availability
                   prototype for a HPC job and resource management service that
                   does not require modification of the service, a fully functional
                   symmetric active/active high availability prototype for a HPC
                   parallel file system metadata service that offers high
                   performance, and two preliminary prototypes for a transparent
                   symmetric active/active replication software framework for
                   client-service and dependent service scenarios that hide the
                   replication infrastructure from clients and services.
                   Assuming a mean-time to failure of 5,000 hours for a head or
                   service node, all presented prototypes improve service
                   availability from 99.285\% to 99.995\% in a two-node system,
                   and to 99.99996\% with three nodes."
}
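
The closing availability figures are consistent with the standard
redundancy model sketched below; the mean time to recover of roughly
36 hours is inferred from the quoted numbers rather than stated in the
abstract.

% Single-node availability, assuming A = MTTF / (MTTF + MTTR)
% with MTTF = 5000 h and an implied MTTR of about 36 h:
\[ A_1 = \frac{5000}{5000 + 36} \approx 0.99285 \quad (99.285\%) \]
% With n redundant nodes failing independently, the service is down
% only when all n nodes are down at once:
\[ A_n = 1 - (1 - A_1)^n, \qquad A_2 \approx 99.995\%, \qquad
   A_3 \approx 99.99996\% \]
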
@mastersthesis{engelmann01distributed,
  author        = "Christian Engelmann",
  title         = "Distributed Peer-to-Peer Control for {Harness}",
  month         = jul # "~7, ",
  year          = "2001",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Double diploma in conjunction with the
                   \href{http://www.f1.fhtw-berlin.de}{Department of
                   Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
                   College for Engineering and Economics (FHTW) Berlin},
                   Germany. Advisors: Prof. Vassil N. Alexandrov (University of
                   Reading); George A. (Al) Geist (Oak Ridge National
                   Laboratory)",
  url           = "http://www.christian-engelmann.info/publications/engelmann01distributed.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann01distributed.ppt.pdf",
  abstract      = "Parallel processing, the method of cutting down a large
                   computational problem into many small tasks which are solved
                   in parallel, is a field of increasing importance in science.
                   Cost-effective, flexible and efficient simulations of
                   mathematical models of physical, chemical or biological
                   real-world problems are replacing the traditional
                   experimental research. Current software solutions for
                   parallel and scientific computation, like Parallel Virtual
                   Machine and Message Passing Interface, have limitations in
                   handling faults and failures, in utilizing heterogeneous and
                   dynamically changing communication structures, and in
                   enabling migrating or cooperative applications. The current
                   research in heterogeneous adaptable reconfigurable networked
                   systems (Harness) aims to produce the next generation of
                   software solutions for distributed computing. A highly
                   available and lightweight distributed virtual machine
                   service provides an encapsulation of a few hundred
                   to a few thousand physical machines in a virtual
                   heterogeneous large scale cluster. A high availability of
                   a service in distributed systems can be achieved by
                   replication of the service state on multiple server
                   processes. If one or more server processes fail, the
                   surviving ones continue to provide the service because they
                   know the state. Since every member of a distributed virtual
                   machine is part of the distributed virtual machine service
                   state and is able to change this state, a distributed control
                   is needed to replicate the state and maintain its
                   consistency. This distributed control manages state changes
                   as well as the state-replication and the detection of and
                   recovery from faults and failures of server processes. This
                   work analyzes system architectures currently used in
                   heterogeneous distributed computing by defining terms,
                   conditions and assumptions. It shows that such systems are
                   asynchronous and may use partially synchronous communication
                   to detect and to distinguish different classes of faults and
                   failures. It describes how a high availability of a large
                   scale distributed service on a huge number of servers
                   residing on different geographical locations can be realized.
                   Asynchronous group communication services, such as Reliable
                   Broadcast, Atomic Broadcast, Distributed Agreement and
                   Membership, are analyzed to develop linearly scalable
                   algorithms in unidirectionally and bidirectionally
                   connected asynchronous peer-to-peer ring architectures.
                   A Transaction Control group communication service is
                   introduced as state-replication service. The system analysis
                   distinguishes different types of distributed systems, where
                   active transactions execute state changes using
                   non-replicated data of one or more servers and inactive
                   transactions report state changes using replicated data only.
                   It is applicable for passive fault-tolerant distributed
                   databases as well as for active fault-tolerant distributed
                   control mechanisms. No control token is used and time stamps
                   are avoided, so that all members of a server group have equal
                   responsibilities and are independent from the system time.
                   A prototype which implements the most complicated Transaction
                   Control algorithm is realized due to the complexity of the
                   distributed system and the early development stage of the
                   introduced algorithms. The prototype is used to obtain
                   practical experience with the state-replication algorithm."
}
@mastersthesis{engelmann01distributed2,
  author        = "Christian Engelmann",
  title         = "Distributed Peer-to-Peer Control for {Harness}",
  month         = feb # "~23, ",
  year          = "2001",
  school        = "\href{http://www.f1.fhtw-berlin.de}{Department of
                   Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
                   College for Engineering and Economics (FHTW) Berlin},
                   Germany",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Double diploma in conjunction with the
                   \href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK. Advisors: Prof. Uwe Metzler (Technical College
                   for Engineering and Economics (FHTW) Berlin); George A. (Al)
                   Geist (Oak Ridge National Laboratory)",
  url           = "http://www.christian-engelmann.info/publications/engelmann01distributed2.pdf",
  url2          = "http://www.christian-engelmann.info/publications/engelmann01distributed2.ppt.pdf",
  abstract      = "Parallel processing, the method of cutting down a large
                   computational problem into many small tasks which are solved
                   in parallel, is a field of increasing importance in science.
                   Cost-effective, flexible and efficient simulations of
                   mathematical models of physical, chemical or biological
                   real-world problems are replacing the traditional
                   experimental research. Current software solutions for
                   parallel and scientific computation, like Parallel Virtual
                   Machine and Message Passing Interface, have limitations in
                   handling faults and failures, in utilizing heterogeneous and
                   dynamically changing communication structures, and in
                   enabling migrating or cooperative applications. The current
                   research in heterogeneous adaptable reconfigurable networked
                   systems (Harness) aims to produce the next generation of
                   software solutions for distributed computing. A highly
                   available and lightweight distributed virtual machine
                   service provides an encapsulation of a few hundred
                   to a few thousand physical machines in a virtual
                   heterogeneous large scale cluster. A high availability of
                   a service in distributed systems can be achieved by
                   replication of the service state on multiple server
                   processes. If one or more server processes fail, the
                   surviving ones continue to provide the service because they
                   know the state. Since every member of a distributed virtual
                   machine is part of the distributed virtual machine service
                   state and is able to change this state, a distributed control
                   is needed to replicate the state and maintain its
                   consistency. This distributed control manages state changes
                   as well as the state-replication and the detection of and
                   recovery from faults and failures of server processes. This
                   work analyzes system architectures currently used in
                   heterogeneous distributed computing by defining terms,
                   conditions and assumptions. It shows that such systems are
                   asynchronous and may use partially synchronous communication
                   to detect and to distinguish different classes of faults and
                   failures. It describes how a high availability of a large
                   scale distributed service on a huge number of servers
                   residing on different geographical locations can be realized.
                   Asynchronous group communication services, such as Reliable
                   Broadcast, Atomic Broadcast, Distributed Agreement and
                   Membership, are analyzed to develop linearly scalable
                   algorithms in unidirectionally and bidirectionally
                   connected asynchronous peer-to-peer ring architectures.
                   A Transaction Control group communication service is
                   introduced as state-replication service. The system analysis
                   distinguishes different types of distributed systems, where
                   active transactions execute state changes using
                   non-replicated data of one or more servers and inactive
                   transactions report state changes using replicated data only.
                   It is applicable for passive fault-tolerant distributed
                   databases as well as for active fault-tolerant distributed
                   control mechanisms. No control token is used and time stamps
                   are avoided, so that all members of a server group have equal
                   responsibilities and are independent from the system time.
                   A prototype which implements the most complicated Transaction
                   Control algorithm is realized due to the complexity of the
                   distributed system and the early development stage of the
                   introduced algorithms. The prototype is used to obtain
                   practical experience with the state-replication algorithm."
}