% ICPP 2013 conference paper. The booktitle is rebuilt from the event name recorded in the note field.
@inproceedings{1cd5f2e995f643559fee701e8cafef44,
  author    = {Zheng, Si and Liu, Yunhuai and Li, Shanshan and He, Tian and Liao, Xiangke},
  title     = {Risk intelligence: Profitting from uncertainty in data processing system},
  booktitle = {Proceedings of the 42nd Annual International Conference on Parallel Processing, {ICPP} 2013},
  series    = {Proceedings of the International Conference on Parallel Processing},
  year      = {2013},
  pages     = {458--467},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  doi       = {10.1109/ICPP.2013.55},
  isbn      = {9780769551173},
  language  = {English (US)},
  keywords  = {Data processing systems, Fault-tolerance, MapReduce, Prediction, Risk-management, Task assignment},
  abstract  = {Fault-tolerance is essential in extreme-scale data processing systems. Pro-active fault-tolerance scheme (such as the speculative execution in MapReduce framework), can dramatically improve the response time of job executions when the failure becomes norm rather than an exception. Efficient pro-active fault-tolerance schemes require precise knowledge on the task executions, which has been an open challenges for decades. To well address the issue, in this paper we design and implement RiskI, a profile-based prediction algorithm in conjunction with a risk-aware task assignment algorithm to accelerate task executions, taking the uncertainty nature of tasks into account. Our design demonstrates that the nature uncertain not only brings great challenges but also new opportunities. With a careful design, we can benefit from such uncertainties. We implement the idea in Hadoop 0.21.0 systems and the experimental results show that compared with the traditional LATE algorithm, the response time can be improved by 46\% with the same system throughput.},
  note      = {42nd Annual International Conference on Parallel Processing, ICPP 2013 ; Conference date: 01-10-2013 Through 04-10-2013},
}