BibTeX bibliography file: Parallel I/O Ninth Edition: ADDITIONS only February 22, 1997 This bibliography is available on the WWW at http://www.cs.dartmouth.edu/pario/bib/ and by ftp at ftp://ftp.cs.dartmouth.edu/pub/pario/pario.bib Both of which are easily reached by the Parallel-I/O Archive at http://www.cs.dartmouth.edu/pario/ This bibliography covers parallel I/O, with a significant emphasis on file systems rather than, say, network or graphics I/O. This includes architecture, operating systems, some algorithms, some databases, and some workload characterization. Because of the expanding nature of this field, I cannot cover everything, and this bibliography is admittedly spotty on topics like disk arrays, parallel databases, and parallel networking. The entries are alphabetized by cite key. The emphasis is on including everything I have, rather than selecting a few key articles of interest. Thus, you probably don't want (or need) to read everything here. There are many repeated entries, in the sense that a paper is often published first as a TR, then in a conference, then in a journal. The "earlier" and "later" tags tie together versions of a paper. Except where noted, all comments are mine, and any opinions expressed there are mine only. In some cases I am simply restating the opinion or result obtained by the paper's authors, and thus even I might disagree with the statement. I keep most editorial comments to a minimum. Please send any additions or corrections (new abstracts and URLs would be great!) to parallel-io-bib@dartmouth.edu. Indeed, if you want to get updates to the bibliography (released once per week), subscribe to that mailing list by sending a message to majordomo@dartmouth.edu whose BODY says subscribe parallel-io-bib You may use the bibliography as you please except for publishing it as a whole, since the compilation is mine. Please leave this header on the collection; BibTeX won't mind. 
David Kotz
Associate Professor
Department of Computer Science
Dartmouth College
6211 Sudikoff Laboratory
Hanover, NH 03755-3510 USA
URL: http://www.cs.dartmouth.edu/~dfk/
603-646-1439
@string {email = "dfk@cs.dartmouth.edu"} % have to hide this from bibtex

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
AGAIN, this file has ONLY the ADDITIONS to the bibliography
between the eighth and ninth editions.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@InProceedings{agrawal:asynch,
  author = {Gagan Agrawal and Anurag Acharya and Joel Saltz},
  title = {An Interprocedural Framework for Placement of Asynchronous {I/O} Operations},
  booktitle = {Proceedings of the 10th ACM International Conference on Supercomputing},
  year = {1996},
  month = {May},
  pages = {358--365},
  publisher = {ACM Press},
  address = {Philadelphia, PA},
  keyword = {compiler, I/O, pario-bib},
  comment = {Not really about parallel applications or parallel I/O, but I think it may be of interest to that community. They propose a compiler framework for a compiler to insert asynchronous I/O operations (start I/O, finish I/O), to satisfy the dependency constraints of the program.}
}

@PhdThesis{ap:thesis,
  author = {Apratim Purakayastha},
  title = {Characterizing and Optimizing Parallel File Systems},
  year = {1996},
  month = {June},
  school = {Dept. of Computer Science, Duke University},
  address = {Durham, NC},
  note = {Also available as technical report CS-1996-10},
  URL = {ftp://ftp.cs.duke.edu/dist/techreport/1996/1996-10.ps.gz},
  keyword = {parallel I/O, multiprocessor file system, file access patterns, workload characterization, file caching, disk-directed I/O, pario-bib},
  abstract = {High-performance parallel file systems are needed to satisfy tremendous I/O requirements of parallel scientific applications.
The design of such parallel file systems depends on a comprehensive understanding of the expected workload, but so far there have been very few usage studies of multiprocessor file systems. In the first part of this dissertation, we attempt to fill this void by measuring a real file-system workload on a production parallel machine, namely the CM-5 at the National Center for Supercomputing Applications. We collect information about nearly every individual I/O request from the mix of jobs running on the machine. Analysis of the traces leads to various recommendations for design of future parallel file systems. Our usage study showed that writes to write-only files are a dominant part of the workload. Therefore, optimizing writes could have a significant impact on overall performance. In the second part of this dissertation, we propose ENWRICH, a compute-processor write-caching scheme for write-only files in parallel file systems. Within its framework, ENWRICH uses a recently proposed high performance implementation of collective I/O operations called disk-directed I/O, but it eliminates a number of limitations of disk-directed I/O. ENWRICH combines low-overhead write caching at the compute processors with high performance disk-directed I/O at the I/O processors to achieve both low latency and high bandwidth. This combination facilitates the use of the powerful disk-directed I/O technique independent of any particular choice of interface, and without the requirement for mapping libraries at the I/O processors. By collecting writes over many files and applications, ENWRICH lets the I/O processors optimize disk I/O over a large pool of requests. We evaluate our design of ENWRICH using simulated implementation and extensive experimentation. We show that ENWRICH achieves high performance for various configurations and workloads. We pinpoint the reasons for ENWRICH's failure to perform well for certain workloads, and suggest possible enhancements.
Finally, we discuss the nuances of implementing ENWRICH on a real platform and speculate about possible adaptations of ENWRICH for emerging multiprocessing platforms.},
  comment = {See also ap:enwrich, ap:workload, and nieuwejaar:workload}
}

@InProceedings{arunachalam:prefetch2,
  author = {Meenakshi Arunachalam and Alok Choudhary and Brad Rullman},
  title = {Implementation and evaluation of prefetching in the {Intel Paragon Parallel File System}},
  booktitle = {Proceedings of the Tenth International Parallel Processing Symposium},
  year = {1996},
  month = {April},
  pages = {554--559},
  URL = {http://www.ece.nwu.edu/~meena/papers/ipps.ps},
  keyword = {parallel I/O, prefetching, multiprocessor file system, pario-bib},
  abstract = {The significant difference between the speeds of the I/O system (e.g., disks) and compute processors in parallel systems creates a bottleneck that lowers the performance of an application that does a considerable amount of disk accesses. A major portion of the compute processors' time is wasted on waiting for I/O to complete. This problem can be addressed to a certain extent, if the necessary data can be fetched from the disk before the I/O call to the disk is issued. Fetching data ahead of time, known as prefetching in a multiprocessor environment depends a great deal on the application's access pattern. The subject of this paper is implementation and performance evaluation of a prefetching prototype in a production parallel file system on the Intel Paragon. Specifically, this paper presents a) design and implementation of a prefetching strategy in the parallel file system and b) performance measurements and evaluation of the file system with and without prefetching. The prototype is designed at the operating system level for the PFS. It is implemented in the PFS subsystem of the Intel Paragon Operating System. It is observed that in many cases prefetching provides considerable performance improvements.
In some other cases no improvements or some performance degradation is observed due to the overheads incurred in prefetching.},
  comment = {See arunachalam:prefetch.}
}

@TechReport{bordawekar:collective,
  author = {Rajesh Bordawekar},
  title = {Implementation and Evaluation of Collective {I/O} in the {Intel Paragon Parallel File System}},
  year = {1996},
  month = {November},
  number = {CACR~TR-128},
  institution = {Center of Advanced Computing Research, California Institute of Technology},
  URL = {http://www.cacr.caltech.edu/~rajesh/collective.html},
  keyword = {parallel I/O, multiprocessor file system, pario-bib},
  abstract = {A majority of parallel applications obtain parallelism by partitioning data over multiple processors. Accessing distributed data structures like arrays from files often requires each processor to make a large number of small non-contiguous data requests. This problem can be addressed by replacing small non-contiguous requests by large collective requests. This approach, known as Collective I/O, has been found to work extremely well in practice. In this paper, we describe implementation and evaluation of a collective I/O prototype in a production parallel file system on the Intel Paragon. The prototype is implemented in the PFS subsystem of the Intel Paragon Operating System. We evaluate the collective I/O performance using its comparison with the PFS M_RECORD and M_UNIX I/O modes. It is observed that collective I/O provides significant performance improvement over accesses in M_UNIX mode. However, in many cases, various implementation overheads cause collective I/O to provide lower performance than the M_RECORD I/O mode.}
}

@TechReport{bordawekar:compcomm-tr,
  author = {Rajesh Bordawekar and Alok Choudhary and J.
Ramanujam},
  title = {Compilation and Communication Strategies for Out-of-core programs on Distributed Memory Machines},
  year = {1995},
  month = {November},
  number = {CACR-113},
  institution = {Scalable I/O Initiative, Center of Advanced Computing Research, California Institute of Technology},
  later = {bordawekar:compcomm},
  URL = {http://www.cat.syr.edu/~rajesh/cacr113.ps},
  abstract = {It is widely acknowledged that improving parallel I/O performance is critical for widespread adoption of high performance computing. In this paper, we show that communication in out-of-core distributed memory problems may require both inter-processor communication and file I/O. Thus, in order to improve I/O performance, it is necessary to minimize the I/O costs associated with a communication step. We present three methods for performing communication in out-of-core distributed memory problems. The first method called the generalized collective communication method follows a loosely synchronous model; computation and communication phases are clearly separated, and communication requires permutation of data in files. The second method called the receiver-driven in-core communication considers only communication required of each in-core data slab individually. The third method called the owner-driven in-core communication goes even one step further and tries to identify the potential future use of data (by the recipients) while it is in the sender's memory. We describe these methods in detail and present a simple heuristic to choose a communication method from among the three methods. We then provide performance results for two out-of-core applications, the two-dimensional FFT code and the two-dimensional elliptic Jacobi solver.
Finally, we discuss how the out-of-core and in-core communication methods can be used in virtual memory environments on distributed memory machines.},
  comment = {See also bordawekar:comm, at ICS'95.}
}

@TechReport{bordawekar:placement-tr,
  author = {Rajesh Bordawekar and Alok Choudhary and J. Ramanujam},
  title = {A Framework for Integrated Communication and {I/O} Placement},
  year = {1996},
  month = {February},
  number = {CACR-117},
  institution = {Scalable I/O Initiative, Center of Advanced Computing Research, California Institute of Technology},
  later = {bordawekar:placement},
  URL = {http://www.cacr.caltech.edu/~rajesh/cacr117.ps},
  keyword = {parallel I/O, compiler, pario-bib},
  abstract = {In this paper, we describe a framework for optimizing communication and I/O costs in out-of-core problems. We focus on communication and I/O optimization within a FORALL construct. We show that existing frameworks do not extend directly to out-of-core problems and can not exploit the FORALL semantics. We present a unified framework for the placement of I/O and communication calls and apply it for optimizing communication for stencil applications. Using the experimental results, we demonstrate that correct placement of I/O and communication calls can completely eliminate extra file I/O from communication and obtain significant performance improvement.}
}

@InProceedings{brezany:architecture,
  author = {Peter Brezany and Thomas A. Mueck and Erich Schikuta},
  title = {A Software Architecture for Massively Parallel Input-Output},
  booktitle = {Third International Workshop PARA'96 (Applied Parallel Computing - Industrial Computation and Optimization)},
  year = {1996},
  month = {August},
  series = {Lecture Notes in Computer Science},
  volume = {1186},
  pages = {85--96},
  publisher = {Springer-Verlag},
  address = {Lyngby, Denmark},
  note = {Also available as Technical Report of the Inst. f.~Angewandte Informatik u.
Informationssysteme, University of Vienna, TR~96202},
  URL = {http://www.pri.univie.ac.at/~schiki/research/paper/para96/para96.ps},
  keyword = {compiler transformations, runtime support, parallel I/O, prefetching, pario-bib},
  abstract = {For an increasing number of data intensive scientific applications, parallel I/O concepts are a major performance issue. Tackling this issue, we provide an outline of an input/output system designed for highly efficient, scalable and conveniently usable parallel I/O on distributed memory systems. The main focus of this paper is the parallel I/O runtime system support provided for software-generated programs produced by parallelizing compilers in the context of High Performance FORTRAN efforts. Specifically, our design is presented in the context of the Vienna Fortran Compilation System.}
}

@InProceedings{brezany:compiling,
  author = {Peter Brezany and Thomas A. Mueck and Erich Schikuta},
  title = {Mass Storage Support for a Parallelizing Compilation System},
  booktitle = {International Conference Eurosim'96-- HPCN challenges in Telecomp and Telecom: Parallel Simulation of Complex Systems and Large Scale Applications},
  year = {1996},
  month = {June},
  pages = {63--70},
  publisher = {North-Holland, Elsevier Science},
  address = {Delft, The Netherlands},
  URL = {http://www.pri.univie.ac.at/~schiki/research/paper/eurosim96/eurosim96.ps},
  keyword = {parallel I/O, high performance mass storage system, high performance languages, compilation techniques, data administration, pario-bib}
}

@TechReport{brezany:irregular-tr,
  author = {P. Brezany and A.
Choudhary},
  title = {Techniques and Optimizations for Developing Irregular Out-of-Core Applications on Distributed-Memory Systems},
  year = {1996},
  month = {November},
  number = {96-4},
  institution = {Institute for Software Technology and Parallel Systems, University of Vienna},
  URL = {http://www.pri.univie.ac.at/~schiki/research/vipios/paper/brezany-choudhary.ps},
  keyword = {parallel I/O, out of core, irregular applications, compiler, pario-bib}
}

@TechReport{cao:tickertaip-tr,
  author = {Pei Cao and Swee Boon Lim and Shivakumar Venkataraman and John Wilkes},
  title = {The {TickerTAIP} parallel {RAID} architecture},
  year = {1992},
  month = {December},
  number = {HPL-92-151},
  institution = {HP Labs},
  later = {cao:tickertaip-tr2},
  keyword = {parallel I/O, RAID, pario-bib},
  comment = {A parallelized RAID architecture that distributes the RAID controller operations across several worker nodes. Multiple hosts can connect to different workers, allowing multiple paths into the array. The workers then communicate on their own fast interconnect to accomplish the requests, distributing parity computations across multiple workers. They get much better performance and reliability than plain RAID. They built a prototype and a performance simulator. Two-phase commit was needed for request atomicity, and a request sequencer was needed for serialization. Also found it was good to give the whole request info to all workers and to let them figure out what to do and when. Superseded by cao:tickertaip-tr2 and cao:tickertaip.}
}

@InProceedings{carretero:mapping,
  author = {J. Carretero and F. P{\'e}rez and P. {de Miguel} and F. Garc{\'\i}a and L.
Alonso},
  title = {{I/O} Data Mapping in {{\em ParFiSys:}} Support for High-Performance {I/O} in Parallel and Distributed Systems},
  booktitle = {Euro-Par~'96},
  year = {1996},
  month = {August},
  series = {Lecture Notes in Computer Science},
  volume = {1123},
  pages = {522--526},
  publisher = {Springer-Verlag},
  URL = {http://laurel.datsi.fi.upm.es/~gp/publications/europar96.ps.Z},
  keyword = {parallel I/O, multiprocessor file system, pario-bib},
  abstract = {This paper gives an overview of the I/O data mapping mechanisms of {\em ParFiSys}. Grouped management and parallelization are presented as relevant features. I/O data mapping mechanisms of {\em ParFiSys}, including all levels of the hierarchy, are described in this paper.}
}

@Article{carretero:subsystem,
  author = {J. Carretero and F. P{\'e}rez and P. de~Miguel and F. Garc{\'\i}a and L. Alonso},
  title = {A Massively Parallel and Distributed {I/O} Subsystem},
  journal = {Computer Architecture News},
  year = {1996},
  month = {June},
  volume = {24},
  number = {3},
  pages = {1--8},
  keyword = {parallel I/O, I/O architecture, pario-bib},
  comment = {See carretero:*, rosales:cds, perez:clfs.}
}

@InProceedings{chehadeh:oodb,
  author = {Y.~C. Chehadeh and A.~R. Hurson and L.~L. Miller and S. Pakzad and B.~N. Jamoussi},
  title = {Application for parallel disks for efficient handling of object-oriented databases},
  booktitle = {Proceedings of the 1993 IEEE Symposium on Parallel and Distributed Processing},
  year = {1993},
  pages = {184--191},
  publisher = {IEEE Computer Society Press},
  keyword = {parallel I/O, disk array, object oriented database, pario-bib},
  abstract = {In today's workstation based environment, applications such as design databases, multimedia databases, and knowledge bases do not fit well into the relational data processing framework. The object-oriented data model has been proposed to model and process such complex databases.
Due to the nature of the supported applications, object-oriented database systems need efficient mechanisms for the retrieval of complex objects and the navigation along the semantic links among objects. Object clustering and buffering have been suggested as efficient mechanisms for the retrieval of complex objects. However, to improve the efficiency of the aforementioned operations, one has to look at the recent advances in storage technology. This paper is an attempt to investigate the feasibility of using parallel disks for object-oriented databases. It analyzes the conceptual changes needed to map the clustering and buffering schemes proposed on the new underlying architecture. The simulation and performance evaluation of the proposed leveled-clustering and mapping schemes utilizing parallel I/O disks are presented and analyzed.}
}

@InProceedings{chen:panda-model,
  author = {Y. Chen and M. Winslett and S. Kuo and Y. Cho and M. Subramaniam and K. E. Seamons},
  title = {Performance Modeling for the {Panda} Array {I/O} Library},
  booktitle = {Proceedings of Supercomputing '96},
  year = {1996},
  month = {November},
  publisher = {ACM Press and IEEE Computer Society Press},
  URL = {http://www.supercomp.org/sc96/proceedings/SC96PROC/YING/INDEX.HTM},
  keyword = {performance modeling, parallel I/O, pario-bib},
  abstract = {We present an analytical performance model for Panda, a library for synchronized i/o of large multidimensional arrays on parallel and sequential platforms, and show how the Panda developers use this model to evaluate Panda's parallel i/o performance and guide future Panda development. The model validation shows that system developers can simplify performance analysis, identify potential performance bottlenecks, and study the design trade-offs for Panda on massively parallel platforms more easily than by conducting empirical experiments.
More importantly, we show that the outputs of the performance model can be used to help make optimal plans for handling application i/o requests, the first step toward our long-term goal of automatically optimizing i/o request handling in Panda.},
  comment = {Web and CDROM only.}
}

@Article{chen:raid-perf,
  author = {S. Chen and D. Towsley},
  title = {A Performance Evaluation of {RAID} Architectures},
  journal = {IEEE Transactions on Computers},
  year = {1996},
  month = {October},
  volume = {45},
  number = {10},
  pages = {1116},
  publisher = {IEEE Computer Society Press},
  keyword = {verify pages, parallel I/O, RAID, disk array, pario-bib}
}

@InProceedings{chiang:graph,
  author = {Yi-Jen Chiang and Michael T. Goodrich and Edward F. Grove and Roberto Tamassia and Darren Erik Vengroff and Jeffrey Scott Vitter},
  title = {External-Memory Graph Algorithms (Extended Abstract)},
  booktitle = {Proceedings of the ACM-SIAM Symposium on Discrete Algorithms (SODA '95)},
  year = {1995},
  month = {January},
  pages = {139--149},
  URL = {ftp://cs.duke.edu/pub/jsv/Papers/CGG95.external_graph.ps.Z},
  abstract = {We present a collection of new techniques for designing and analyzing efficient external-memory algorithms for graph problems and illustrate how these techniques can be applied to a wide variety of specific problems. Our results include: \begin{itemize} \item {\em Proximate-neighboring}. We present a simple method for deriving external-memory lower bounds via reductions from a problem we call the ``proximate neighbors'' problem. We use this technique to derive non-trivial lower bounds for such problems as list ranking, expression tree evaluation, and connected components. \item {\em PRAM simulation}. We give methods for efficiently simulating PRAM computations in external memory, even for some cases in which the PRAM algorithm is not work-optimal. We apply this to derive a number of optimal (and simple) external-memory graph algorithms. \item {\em Time-forward processing}.
We present a general technique for evaluating circuits (or ``circuit-like'' computations) in external memory. We also use this in a deterministic list ranking algorithm. \item {\em Deterministic 3-coloring of a cycle}. We give several optimal methods for 3-coloring a cycle, which can be used as a subroutine for finding large independent sets for list ranking. Our ideas go beyond a straightforward PRAM simulation, and may be of independent interest. \item {\em External depth-first search}. We discuss a method for performing depth first search and solving related problems efficiently in external memory. Our technique can be used in conjunction with ideas due to Ullman and Yannakakis in order to solve graph problems involving closed semi-ring computations even when their assumption that vertices fit in main memory does not hold. \end{itemize} \par Our techniques apply to a number of problems, including list ranking, which we discuss in detail, finding Euler tours, expression-tree evaluation, centroid decomposition of a tree, least-common ancestors, minimum spanning tree verification, connected and biconnected components, minimum spanning forest, ear decomposition, topological sorting, reachability, graph drawing, and visibility representation.}
}

@InProceedings{chiung-san:xdac,
  author = {Lee, Chiung-San and Parng, Tai-Ming and Lee, Jew-Chin and Tsai, Cheng-Nan and Farn, Kwo-Jean},
  title = {Performance analysis of the {XDAC} disk array system},
  booktitle = {Proceedings of the 1994 IEEE Symposium on Parallel and Distributed Processing},
  year = {1994},
  pages = {620--627},
  publisher = {IEEE Computer Society Press},
  keyword = {disk array, performance evaluation, analytical model, parallel I/O, pario-bib},
  abstract = {The paper presents an analytical model of a whole disk array architecture, XDAC, which consists of several major subsystems and features: the two-dimensional array structure; IO-bus with split transaction protocol; and cache for processing multiple I/O requests in
parallel. Our modelling approach is based on a subsystem access time per request (SATPR) concept, in which we model for each subsystem the mean access time per disk array request. The model is fed with a given set of representative workload parameters and then used to conduct performance analysis for exploring the impact of fork/join synchronization as well as evaluating some architectural design issues of the XDAC system. Moreover, by comparing the SATPRs of subsystems, we can identify the bottleneck for performance improvements.}
}

@Article{choudhary:sdcr,
  author = {Alok Choudhary and David Kotz},
  title = {Large-Scale File Systems with the Flexibility of Databases},
  journal = {ACM Computing Surveys},
  year = {1996},
  month = {December},
  volume = {28A},
  number = {4},
  note = {Position paper for the Working Group on Storage I/O for Large-Scale Computing, ACM Workshop on Strategic Directions in Computing Research. Available on-line only, at http://www.acm.org/surveys/1996/ChoudharyFile/},
  URL = {http://www.acm.org/surveys/1996/ChoudharyFile/},
  keyword = {file system, database, parallel I/O, pario-bib},
  comment = {A position paper for the Strategic Directions in Computing Research workshop at MIT in June 1996.}
}

@InProceedings{chung-sheng:arrays,
  author = {Li, Chung-Sheng and Chen, Ming-Syan and P.~S. Yu and Hsiao, Hui-I},
  title = {Combining replication and parity approaches for fault-tolerant disk arrays},
  booktitle = {Proceedings of the 1994 IEEE Symposium on Parallel and Distributed Processing},
  year = {1994},
  pages = {360--367},
  publisher = {IEEE Computer Society Press},
  keyword = {fault tolerance, disk array, replication, declustering, parallel I/O, pario-bib},
  abstract = {We explore the method of combining the replication and parity approaches to tolerate multiple disk failures in a disk array. In addition to the conventional mirrored and chained declustering methods, a method based on the hybrid of mirrored-and-chained declustering is explored.
A performance study that explores the effect of combining replication and parity approaches is conducted. It is experimentally shown that the proposed approach can lead to the most cost-effective solution if the objective is to sustain the same load as before the failures.},
  comment = {Consider hybrid chained and mirrored declustering.}
}

@Article{corbett:jvesta,
  author = {Peter F. Corbett and Dror G. Feitelson},
  title = {The {Vesta} Parallel File System},
  journal = {ACM Transactions on Computer Systems},
  year = {1996},
  month = {August},
  volume = {14},
  number = {3},
  pages = {225--264},
  publisher = {ACM Press},
  earlier = {corbett:vesta},
  keyword = {multiprocessor file system, Vesta, parallel I/O, pario-bib},
  comment = {See also corbett:pfs, corbett:vesta*, feitelson:pario. This is the ultimate Vesta reference. There seem to be only a few small things that are completely new over what's been published elsewhere, although this presentation is much more complete and polished.}
}

@Misc{corbett:sio-api1.0,
  author = {Peter F. Corbett and Jean-Pierre Prost and Chris Demetriou and Garth Gibson and Erik Riedel and Jim Zelenka and Yuqun Chen and Ed Felten and Kai Li and John Hartman and Larry Peterson and Brian Bershad and Alec Wolman and Ruth Aydt},
  title = {Proposal for a Common Parallel File System Programming Interface},
  year = {1996},
  month = {September},
  howpublished = {WWW http://www.cs.arizona.edu/sio/api1.0.ps},
  note = {Version 1.0.},
  URL = {http://www.cs.arizona.edu/sio},
  keyword = {parallel I/O, multiprocessor file system interface, pario-bib},
  comment = {Specs of the proposed SIO low-level interface for parallel file systems. Key features: linear file model, scatter-gather read and write calls (list of strided segments), asynch versions of all calls, extensive hint system. Naming structure is unspecified; no directories specified. Permissions left out. Some control over client caching and over disk layout.
Each file has a (small) 'label', which is just a little space for application-controlled meta data. Optional extensions: collective read and write calls, fast copy.}
}

@TechReport{cormen:early-vic,
  author = {Thomas H. Cormen and Melissa Hirschl},
  title = {Early Experiences in Evaluating the Parallel Disk Model with the {ViC*} Implementation},
  year = {1996},
  month = {August},
  number = {PCS-TR96-293},
  institution = {Dept. of Computer Science, Dartmouth College},
  note = {To appear in {\em Parallel Computing.}},
  URL = {http://www.cs.dartmouth.edu/reports/abstracts/TR96-293/},
  keyword = {parallel I/O, parallel I/O algorithm, compiler, pario-bib},
  abstract = {Although several algorithms have been developed for the Parallel Disk Model (PDM), few have been implemented. Consequently, little has been known about the accuracy of the PDM in measuring I/O time and total time to perform an out-of-core computation. This paper analyzes timing results on a uniprocessor with several disks for two PDM algorithms, out-of-core radix sort and BMMC permutations, to determine the strengths and weaknesses of the PDM. \par The results indicate the following. First, good PDM algorithms are usually not I/O bound. Second, of the four PDM parameters, two (problem size and memory size) are good indicators of I/O time and running time, but the other two (block size and number of disks) are not. Third, because PDM algorithms tend not to be I/O bound, asynchronous I/O effectively hides I/O times. \par The software interface to the PDM is part of the ViC* run-time library. The interface is a set of wrappers that are designed to be both efficient and portable across several parallel file systems and target machines.}
}

@Article{cormen:fft,
  author = {Thomas H. Cormen and David M.
Nicol},
  title = {Performing Out-of-Core {FFTs} on Parallel Disk Systems},
  journal = {Parallel Computing},
  year = {1997},
  note = {To appear; currently available as Dartmouth Technical Report PCS-TR96-294},
  earlier = {cormen:fft-tr},
  keyword = {verify month number volume and pages, parallel I/O, out of core, scientific computing, FFT, pario-bib}
}

@TechReport{cormen:fft-tr,
  author = {Thomas H. Cormen and David M. Nicol},
  title = {Performing Out-of-Core {FFTs} on Parallel Disk Systems},
  year = {1996},
  number = {PCS-TR96-294},
  institution = {Dept. of Computer Science, Dartmouth College},
  later = {cormen:fft},
  URL = {http://www.cs.dartmouth.edu/reports/abstracts/TR96-294/},
  keyword = {parallel I/O, out of core, scientific computing, FFT, pario-bib},
  abstract = {The Fast Fourier Transform (FFT) plays a key role in many areas of computational science and engineering. Although most one-dimensional FFT problems can be solved entirely in main memory, some important classes of applications require out-of-core techniques. For these, use of parallel I/O systems can improve performance considerably. This paper shows how to perform one-dimensional FFTs using a parallel disk system with independent disk accesses. We present both analytical and experimental results for performing out-of-core FFTs in two ways: using traditional virtual memory with demand paging, and using a provably asymptotically optimal algorithm for the Parallel Disk Model (PDM) of Vitter and Shriver. When run on a DEC 2100 server with a large memory and eight parallel disks, the optimal algorithm for the PDM runs up to 144.7 times faster than in-core methods under demand paging. Moreover, even including I/O costs, the normalized times for the optimal PDM algorithm are competitive, or better than, those for in-core methods even when they run entirely in memory.}
}

@TechReport{cormen:fft2-tr,
  author = {Thomas H. Cormen and Jake Wegmann and David M.
Nicol},
  title = {Multiprocessor Out-of-Core {FFTs} with Distributed Memory and Parallel Disks},
  year = {1997},
  number = {PCS-TR97-303},
  institution = {Dept. of Computer Science, Dartmouth College},
  note = {Submitted to SPAA'97.},
  URL = {http://www.cs.dartmouth.edu/reports/abstracts/TR97-303/},
  keyword = {parallel I/O, out of core, scientific computing, FFT, pario-bib},
  abstract = {This paper extends an earlier out-of-core Fast Fourier Transform (FFT) method for a uniprocessor with the Parallel Disk Model (PDM) to use multiple processors. Four out-of-core multiprocessor methods are examined. Operationally, these methods differ in the size of ``mini-butterfly'' computed in memory and how the data are organized on the disks and in the distributed memory of the multiprocessor. The methods also perform differing amounts of I/O and communication. Two of them have the remarkable property that even though they are computing the FFT on a multiprocessor, all interprocessor communication occurs outside the mini-butterfly computations. Performance results on a small workstation cluster indicate that except for unusual combinations of problem size and memory size, the methods that do not perform interprocessor communication during the mini-butterfly computations require approximately 86\% of the time of those that do.
Moreover, the faster methods are much easier to implement.}, comment = {Extends the work of cormen:fft.} } @TechReport{cortes:paca-tr, author = {Toni Cortes and Sergi Girona and Jes\'us Labarta}, title = {{PACA}: A Cooperative File System Cache for Parallel Machines}, year = {1996}, number = {96-07}, institution = {UPC-CEPBA}, later = {cortes:paca}, URL = {ftp://ftp.ac.upc.es/pub/reports/CEPBA/1996/UPC-CEPBA-1996-7.ps.Z}, keyword = {file caching, multiprocessor file system, cooperative caching, parallel I/O, pario-bib}, comment = {See cortes:paca.} } @InProceedings{cortes:pafs, author = {Toni Cortes and Sergi Girona and Jes\'us Labarta}, title = {Avoiding the Cache-Coherence Problem in a Parallel/Distributed File System}, booktitle = {Proceedings of the High-Performace Computing and Networking}, year = {1997}, month = {April}, keyword = {verify pages, file caching, multiprocessor file system, cooperative caching, cache coherence, parallel I/O, pario-bib}, abstract = {In this paper we describe PAFS, a new parallel/distributed file system. Within the whole file system, special interest is placed on the caching mechanism. We present a cooperative cache that has the advantages of cooperation and avoids the problems derived from the coherence mechanisms. Furthermore, this has been achieved with a reasonable gain in performance. 
In order to show the obtained performance, we present a comparison between PAFS and xFS (a file system that also implements a cooperative cache).},
  comment = {Contact toni@ac.upc.es.}
}

@Article{cypher:jrequire,
  author = {Robert Cypher and Alex Ho and Smaragda Konstantinidou and Paul Messina},
  title = {A Quantitative Study of Parallel Scientific Applications with Explicit Communication},
  journal = {Journal of Supercomputing},
  year = {1996},
  month = {March},
  volume = {10},
  number = {1},
  pages = {5--24},
  earlier = {cypher:require},
  keyword = {workload characterization, scientific computing, parallel programming, message passing, pario-bib},
  comment = {Some mention of I/O.}
}

@InProceedings{demmel:eosdis,
  author = {James Demmel and Melody Y. Ivory and Sharon L. Smith},
  title = {Modeling and Identifying Bottlenecks in {EOSDIS}},
  booktitle = {Proceedings of the Sixth Symposium on the Frontiers of Massively Parallel Computation},
  year = {1996},
  month = {October},
  pages = {300--308},
  publisher = {IEEE Computer Society Press},
  keyword = {climate modeling, performance modeling, parallel I/O, pario-bib},
  abstract = {Many parallel application areas that exploit massive parallelism, such as climate modeling, require massive storage systems for the archival and retrieval of data sets. As such, advances in massively parallel computation must be coupled with advances in mass storage technology in order to satisfy I/O constraints of these applications. We demonstrate the effects of such I/O-computation disparity for a representative distributed information system, NASA's Earth Observing System Distributed Information System (EOSDIS). We use performance modeling to identify bottlenecks in EOSDIS for two representative user scenarios from climate change research.}
}

@InProceedings{fineberg:pmpio,
  author = {Samuel A. Fineberg and Parkson Wong and Bill Nitzberg and Chris Kuszmaul},
  title = {{PMPIO}--- A Portable Implementation of {MPI-IO}},
  booktitle = {Proceedings of the Sixth Symposium on the Frontiers of Massively Parallel Computation},
  year = {1996},
  month = {October},
  pages = {188--195},
  publisher = {IEEE Computer Society Press},
  keyword = {parallel I/O, pario-bib},
  abstract = {MPI-IO provides a demonstrably efficient portable parallel Input/Output interface, compatible with the MPI standard. PMPIO is a "reference implementation" of MPI-IO, developed at NASA Ames Research Center. To date, PMPIO has been ported to the IBM SP-2, SGI and Sun shared memory workstations, the Intel Paragon, and the Cray J90. Preliminary results using the PMPIO implementation of MPI-IO show an improvement of as much as a factor of 20 on the NAS BTIO benchmark compared to a Fortran based implementation. We show comparative results on the SP-2, Paragon, and SGI architectures.}
}

@TechReport{gibson:nasd-tr,
  author = {Garth A. Gibson and David P. Nagle and Khalil Amiri and Fay W. Chang and Eugene Feinberg and Howard Gobioff and Chen Lee and Berend Ozceri and Erik Riedel and David Rochberg},
  title = {A Case for Network-Attached Secure Disks},
  year = {1996},
  month = {June},
  number = {CMU-CS-96-142},
  institution = {Carnegie-Mellon University},
  URL = {http://www.cs.cmu.edu/Groups/NASD/ARPA96/OSDI-TR-new-FInal.ps},
  keyword = {parallel I/O, network-attached storage, distributed file systems, pario-bib},
  comment = {See http://www.cs.cmu.edu/Groups/NASD/ARPA96/server.html}
}

@InProceedings{golubchik:striping,
  author = {Leana Golubchik and Richard R. Muntz and Richard W.
Watson},
  title = {Analysis of Striping Techniques in Robotic Storage Libraries},
  booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems},
  year = {1995},
  month = {September},
  pages = {225--238},
  publisher = {IEEE Computer Society Press},
  URL = {http://www.computer.org/conferen/mss95/golub/golub.htm},
  keyword = {mass storage, parallel I/O, pario-bib},
  abstract = {In recent years advances in computational speed have been the main focus of research and development in high performance computing. In comparison, the improvement in I/O performance has been modest. Faster processing speeds have created a need for faster I/O as well as for the storage and retrieval of vast amounts of data. The technology needed to develop these mass storage systems exists today. Robotic storage libraries are vital components of such systems. However, they normally exhibit high latency and long transmission times. We analyze the performance of robotic storage libraries and study striping as a technique for improving response time. Although striping has been extensively studied in the context of disk arrays, the architectural differences between robotic storage libraries and arrays of disks suggest that a separate study of striping techniques in such libraries would be beneficial.}
}

@InProceedings{grossman:library,
  author = {R. Grossman and X. Qin and W. Xu and H. Hulen and T. Tyler},
  title = {An Architecture for a Scalable High-Performance Digital Library},
  booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems},
  year = {1995},
  month = {September},
  pages = {89--98},
  publisher = {IEEE Computer Society Press},
  URL = {http://www.computer.org/conferen/mss95/grossman/grossman.htm},
  keyword = {mass storage, parallel I/O, pario-bib},
  abstract = {Requirements for a high-performance, scalable digital library of multimedia data are presented together with a layered architecture for a system that addresses the requirements. The approach is to view digital data as persistent collections of complex objects and to use lightweight object management to manage this data. To scale as the amount of data increases, the object management component is layered over a storage management component. The storage management component supports hierarchical storage, third-party data transfer and parallel input-output. Several issues that arise from the interface between the storage management and object management components are discussed. The authors have developed a prototype of a digital library using this design. Two key components of the prototype are AIM Net and HPSS. AIM Net is a persistent object manager and is a product of Oak Park Research. HPSS is the High Performance Storage System, developed by a collaboration including IBM Government Systems and several national labs.}
}

@InProceedings{johnson:scx,
  author = {Steve Johnson and Steve Scott},
  title = {A Supercomputer System Interconnect and Scalable {IOS}},
  booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems},
  year = {1995},
  month = {September},
  pages = {357--367},
  publisher = {IEEE Computer Society Press},
  URL = {http://www.computer.org/conferen/mss95/johnson/johnson.htm},
  keyword = {mass storage, I/O architecture, I/O interconnect, supercomputer, parallel I/O, pario-bib},
  abstract = {The evolution of system architectures and system configurations has created the need for a new supercomputer system interconnect. Attributes required of the new interconnect include commonality among system and subsystem types, scalability, low latency, high bandwidth, a high level of resiliency, and flexibility. Cray Research Inc. is developing a new system channel to meet these interconnect requirements in future systems. The channel has a ring-based architecture, but can also function as a point-to-point link. It integrates control and data on a single, physical path while providing low latency and variance for control messages.
Extensive features for client isolation, diagnostic capabilities, and fault tolerance have been incorporated into the design. The attributes and features of this channel are discussed along with implementation and protocol specifics.},
  comment = {About the Cray Research SCX channel, capable of 1200 MB/s peak and 900 MB/s delivered throughput.}
}

@InProceedings{jones:mpi-io,
  author = {Terry Jones and Richard Mark and Jeanne Martin and John May and Elsie Pierce and Linda Stanberry},
  title = {An {MPI-IO} Interface to {HPSS}},
  booktitle = {Proceedings of the Fifth NASA Goddard conference on Mass Storage Systems},
  year = {1996},
  month = {September},
  pages = {I:37--50},
  keyword = {mass storage, parallel I/O, multiprocessor file system interface, pario-bib}
}

@InProceedings{kandemir:io-optimize,
  author = {Mahmut Kandemir and Alok Choudhary and Rajesh Bordawekar},
  title = {{I/O} Optimizations for Compiling Out-of-Core Programs on Distributed-Memory Machines},
  booktitle = {Proceedings of the Eighth SIAM Conference on Parallel Processing for Scientific Computing},
  year = {1997},
  month = {March},
  publisher = {Society for Industrial and Applied Mathematics},
  note = {To appear. Extended Abstract.},
  URL = {http://www.cacr.caltech.edu/~rajesh/siam97.ps},
  keyword = {verify pages, parallel I/O, compiler, out-of-core, pario-bib},
  abstract = {Since many of large scale computational problems usually deal with large quantities of data, optimizing the performance of I/O subsystems of massively parallel machines is an important challenge for system designers. We describe data access reorganization strategies for efficient compilation of out-of-core data-parallel programs on distributed memory machines. Our analytical approach and experimental results indicate that the optimizations introduced in this paper can reduce the amount of time spent in I/O by as much as an order of magnitude on both uniprocessors and multicomputers.}
}

@InProceedings{kandemir:optimize,
  author = {Mahmut Kandemir and Alok Choudhary and J. Ramanujam and Rajesh Bordawekar},
  title = {Optimizing Out-of-Core Computations in Uniprocessors},
  booktitle = {Proceedings of the Workshop on Interaction between Compilers and Computer Architectures},
  year = {1997},
  month = {February},
  pages = {1--10},
  URL = {http://www.cacr.caltech.edu/~rajesh/hpca.ps},
  keyword = {verify publisher, parallel I/O, compiler, out-of-core, pario-bib},
  abstract = {Programs accessing disk-resident arrays perform poorly in general due to excessive number of I/O calls and insufficient help from compilers. In this paper, in order to alleviate this problem, we propose a series of compiler optimizations. Both the analytical approach we use and the experimental results provide strong evidence that our method is very effective on uniprocessors for out-of-core nests whose data sizes far exceed the size of available memory.}
}

@InProceedings{kandemir:reorganize,
  author = {Mahmut Kandemir and Rajesh Bordawekar and Alok Choudhary},
  title = {Data Access Reorganizations in Compiling Out-of-Core Data Parallel Programs on Distributed Memory Machines},
  booktitle = {Proceedings of the Eleventh International Parallel Processing Symposium},
  year = {1997},
  month = {April},
  URL = {http://www.cacr.caltech.edu/~rajesh/ipps97.ps},
  keyword = {verify pages, compiler, data-parallel, out-of-core, parallel I/O, pario-bib},
  abstract = {This paper describes optimization techniques for translating out-of-core programs written in a data parallel language to message passing node programs with explicit parallel I/O. We demonstrate that straightforward extension of in-core compilation techniques does not work well for out-of-core programs.
We then describe how the compiler can optimize the code by (1) determining appropriate file layouts for out-of-core arrays, (2) permuting the loops in the nest(s) to allow efficient file access, and (3) partitioning the available node memory among references based on I/O cost estimation. Our experimental results indicate that these optimizations can reduce the amount of time spent in I/O by as much as an order of magnitude.}
}

@InProceedings{kandemir:tiling,
  author = {Mahmut Kandemir and Rajesh Bordawekar and Alok Choudhary and J. Ramanujam},
  title = {A Unified Tiling Approach for Out-of-Core Computations},
  booktitle = {Sixth Workshop on Compilers for Parallel Computers},
  year = {1996},
  month = {December},
  pages = {323--334},
  publisher = {Forschungszentrum J{\"u}lich GmbH},
  address = {Aachen, Germany},
  note = {Also available as Caltech Technical Report CACR 130},
  URL = {http://www.cacr.caltech.edu/~rajesh/cpc.ps},
  keyword = {parallel I/O, compiler, out-of-core, pario-bib},
  abstract = {This paper describes a framework by which an out-of-core stencil program written in a data-parallel language can be translated into node programs in a distributed-memory message-passing machine with explicit I/O and communication. We focus on a technique called \emph{Data Space Tiling} to group data elements into slabs that can fit into memories of processors. Methods to choose \emph{legal} tile shapes under several constraints and deadlock-free scheduling of tiles are investigated. Our approach is \emph{unified} in the sense that it can be applied to both FORALL loops and the loops that involve flow-dependences.}
}

@InProceedings{kimbrel:prefetch,
  author = {Tracy Kimbrel and Pei Cao and Edward Felten and Anna Karlin and Kai Li},
  title = {Integrating Parallel Prefetching and Caching},
  booktitle = {Proceedings of the 1996 ACM Sigmetrics Conference on Measurement and Modeling of Computer Systems},
  year = {1996},
  month = {May},
  pages = {262--263},
  publisher = {ACM Press},
  address = {Philadelphia, PA},
  note = {Poster paper.},
  keyword = {disk prefetching, parallel I/O, pario-bib},
  comment = {They do a theoretical analysis of prefetching and caching in uniprocessor, single- and multi-disk situations, given that they know the complete access sequence; their measure is not hit rate but rather overall execution time. They found some algorithms that are close to optimal.}
}

@InProceedings{kimbrel:prefetch-trace,
  author = {Tracy Kimbrel and Andrew Tomkins and R. Hugo Patterson and Brian Bershad and Pei Cao and Edward Felten and Garth Gibson and Anna R. Karlin and Kai Li},
  title = {A Trace-Driven Comparison of Algorithms for Parallel Prefetching and Caching},
  booktitle = {Proceedings of the 1996 Symposium on Operating Systems Design and Implementation},
  year = {1996},
  month = {October},
  pages = {19--34},
  publisher = {USENIX Association},
  URL = {http://www.usenix.org/publications/library/proceedings/osdi96/kimbrel.html},
  keyword = {parallel I/O, tracing, prefetch, trace-driven simulation, pario-bib},
  abstract = {High-performance I/O systems depend on prefetching and caching in order to deliver good performance to applications. These two techniques have generally been considered in isolation, even though there are significant interactions between them; a block prefetched too early reduces the effectiveness of the cache, while a block cached too long reduces the effectiveness of prefetching.
In this paper we study the effects of several combined prefetching and caching strategies for systems with multiple disks. Using disk-accurate trace-driven simulation, we explore the performance characteristics of each of the algorithms in cases in which applications provide full advance knowledge of accesses using hints. Some of the strategies have been published with theoretical performance bounds, and some are components of systems that have been built. One is a new algorithm that combines the desirable characteristics of the others. We find that when performance is limited by I/O stalls, aggressive prefetching helps to alleviate the problem; that more conservative prefetching is appropriate when significant I/O stalls are not present; and that a single, simple strategy is capable of doing both.}
}

@InProceedings{kobler:eosdis,
  author = {Ben Kobler and John Berbert and Parris Caulk and P.~C. Hariharan},
  title = {Architecture and Design of Storage and Data Management for the {NASA Earth Observing System Data and Information System (EOSDIS)}},
  booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems},
  year = {1995},
  month = {September},
  pages = {65--76},
  publisher = {IEEE Computer Society Press},
  URL = {http://www.computer.org/conferen/mss95/kobler/kobler.htm},
  keyword = {mass storage, I/O architecture, parallel I/O, pario-bib},
  abstract = {Mission to Planet Earth (MTPE) is a long-term NASA research mission to study the processes leading to global climate change. The EOS Data and Information System (EOSDIS) is the component within MTPE that will provide the Earth science community with easy, affordable, and reliable access to Earth science data. EOSDIS is a distributed system, with major facilities at eight Distributed Active Archive Centers (DAACs) located throughout the United States. At the DAACs the Science Data Processing Segment (SDPS) will receive, process, archive, and manage all data. It is estimated that several hundred gigaflops of processing power will be required to process and archive the several terabytes of new data that will be generated and distributed daily. Thousands of science users and perhaps several hundred thousand nonscience users will access the system.}
}

@TechReport{kotz:app-pario,
  author = {David Kotz},
  title = {Applications of Parallel {I/O}},
  year = {1996},
  month = {October},
  number = {PCS-TR96-297},
  institution = {Dept. of Computer Science, Dartmouth College},
  note = {Release 1},
  URL = {ftp://ftp.cs.dartmouth.edu/TR/TR96-297.ps.Z},
  keyword = {parallel I/O application, file access patterns, pario-bib},
  abstract = {Scientific applications are increasingly being implemented on massively parallel supercomputers. Many of these applications have intense I/O demands, as well as massive computational requirements. This paper is essentially an annotated bibliography of papers and other sources of information about scientific applications using parallel I/O. It will be updated periodically.}
}

@InProceedings{kotz:flexibility2,
  author = {David Kotz and Nils Nieuwejaar},
  title = {Flexibility and Performance of Parallel File Systems},
  booktitle = {Proceedings of the Third International Conference of the Austrian Center for Parallel Computation (ACPC)},
  year = {1996},
  month = {September},
  series = {Lecture Notes in Computer Science},
  volume = {1127},
  pages = {1--11},
  publisher = {Springer-Verlag},
  earlier = {kotz:flexibility},
  URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/kotz:flexibility2.ps.Z},
  keyword = {parallel I/O, multiprocessor file system, dfk, pario-bib},
  abstract = {As we gain experience with parallel file systems, it becomes increasingly clear that a single solution does not suit all applications. For example, it appears to be impossible to find a single appropriate interface, caching policy, file structure, or disk-management strategy. Furthermore, the proliferation of file-system interfaces and abstractions make applications difficult to port. \par We propose that the traditional functionality of parallel file systems be separated into two components: a fixed core that is standard on all platforms, encapsulating only primitive abstractions and interfaces, and a set of high-level libraries to provide a variety of abstractions and application-programmer interfaces (APIs). \par We present our current and next-generation file systems as examples of this structure. Their features, such as a three-dimensional file structure, strided read and write interfaces, and I/O-node programs, are specifically designed with the flexibility and performance necessary to support a wide range of applications.},
  comment = {Nearly identical to kotz:flexibility. The only changes are the format, a shorter abstract, and updates to Section 7 and the references.}
}

@TechReport{kotz:tuning,
  author = {David Kotz},
  title = {Tuning {STARFISH}},
  year = {1996},
  month = {October},
  number = {PCS-TR96-296},
  institution = {Dept. of Computer Science, Dartmouth College},
  URL = {ftp://ftp.cs.dartmouth.edu/TR/TR96-296.ps.Z},
  keyword = {parallel I/O, multiprocessor file system, pario-bib},
  abstract = {STARFISH is a parallel file-system simulator we built for our research into the concept of disk-directed I/O. In this report, we detail steps taken to tune the file systems supported by STARFISH, which include a traditional parallel file system (with caching) and a disk-directed I/O system. In particular, we now support two-phase I/O, use smarter disk scheduling, increased the maximum number of outstanding requests that a compute processor may make to each disk, and added gather/scatter block transfer. We also present results of the experiments driving the tuning effort.},
  comment = {Reports on some new changes to the STARFISH simulator that implements traditional caching and disk-directed I/O.
This is meant mainly as a companion to kotz:jdiskdir. See also kotz:jdiskdir, kotz:diskdir, kotz:expand.}
}

@InProceedings{kwong:distribution,
  author = {Peter Kwong and Shikharesh Majumdar},
  title = {Study of Data Distribution Strategies for Parallel {I/O} Management},
  booktitle = {Proceedings of the Third International Conference of the Austrian Center for Parallel Computation (ACPC)},
  year = {1996},
  month = {September},
  series = {Lecture Notes in Computer Science},
  volume = {1127},
  pages = {12--23},
  publisher = {Springer-Verlag},
  keyword = {parallel I/O, pario-bib},
  abstract = {Recent studies have demonstrated that a significant number of I/O operations are performed by a number of classes of different parallel applications. Appropriate I/O management strategies are required however for harnessing the power of parallel I/O. This paper focuses on two I/O management issues that affect system performance in multiprogrammed parallel environments. Characterization of I/O behavior of parallel applications in terms of four different models is discussed first, followed by an investigation of the performance of a number of different data distribution strategies. Using computer simulations this research shows that I/O characteristics of applications and data distribution have an important effect on system performance. Applications that can simultaneously do computation and I/O, plus strategies that can incorporate centralized I/O management are found to be beneficial for a multiprogrammed parallel environment.},
  comment = {See majumdar:management.}
}

@Misc{large-scale-memories,
  key = {Algorithmica},
  title = {Special issue on Large-Scale Memories},
  year = {1994},
  volume = {12},
  number = {2},
  howpublished = {Algorithmica}
}

@Article{lawlor:parity,
  author = {F.~D. Lawlor},
  title = {Efficient mass storage parity recovery mechanism},
  journal = {IBM Technical Disclosure Bulletin},
  year = {1981},
  month = {July},
  volume = {24},
  number = {2},
  pages = {986--987},
  keyword = {parallel I/O, disk array, RAID, pario-bib},
  comment = {An early paper, perhaps the earliest, that describes the techniques that later became RAID. Lawlor notes how to use parity to recover data lost due to disk crash, as in RAID3, addresses the read-before-write problem by caching the old data block as well as the new data block, and shows how two-dimensional parity can protect against two or more failures.}
}

@InProceedings{lee:logical-disks,
  author = {Jang Sun Lee and Jungmin Kim and P. Bruce Berra and Sanjay Ranka},
  title = {Logical Disks: User-Controllable {I/O} For Scientific Applications},
  booktitle = {Proceedings of the 1996 IEEE Symposium on Parallel and Distributed Processing},
  year = {1996},
  month = {October},
  pages = {340--347},
  publisher = {IEEE Computer Society Press},
  keyword = {logical disks, parallel I/O, pario-bib},
  abstract = {In this paper we propose user-controllable I/O operations and explore the effects of them with some synthetic access patterns. The operations allow users to determine a file structure matching the access patterns, control the layout and distribution of data blocks on physical disks, and present various access patterns with a minimum number of I/O operations. The operations do not use a file pointer to access data as in typical file systems, which eliminates the overhead of managing the offset of the file, making it easy to share data and reducing the number of I/O operations.}
}

@InProceedings{lee:petal,
  author = {Edward K. Lee and Chandramohan A.
Thekkath},
  title = {Petal: Distributed Virtual Disks},
  booktitle = {Proceedings of the Seventh International Conference on Architectural Support for Programming Languages and Operating Systems},
  year = {1996},
  month = {October},
  pages = {84--92},
  address = {Cambridge, MA},
  URL = {http://www.research.digital.com/SRC/personal/Chandu_Thekkath/Papers/petal-asplos96.ps},
  keyword = {parallel I/O, distributed file system, declustering, reliability, pario-bib},
  comment = {They are trying to build a file server that is easier to manage than most of today's distributed file systems, because disks are cheap but management is expensive. They describe a distributed file server that spreads blocks of all files across many disks and many servers. They use chained declustering so that they can survive loss of server or disk. They dynamically balance load. They dynamically reconfigure when new virtual disks are created or new physical disks are added. They've built it all and are now going to look at possible file systems that can take advantage of the features of Petal.}
}

@InProceedings{lee:raidmodel,
  author = {Edward K. Lee and Randy H. Katz},
  title = {An Analytic Performance Model of Disk Arrays},
  booktitle = {Proceedings of the 1993 ACM Sigmetrics Conference on Measurement and Modeling of Computer Systems},
  year = {1993},
  pages = {98--109},
  keyword = {disk array, parallel I/O, RAID, analytic model, pario-bib}
}

@InProceedings{lee:userio,
  author = {Jang Sun Lee and Sang-Gue Oh and P. Bruce Berra and Sanjay Ranka},
  title = {User-Controllable {I/O} for Parallel Computers},
  booktitle = {International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA~'96)},
  year = {1996},
  month = {August},
  pages = {442--453},
  keyword = {parallel I/O, pario-bib},
  abstract = {This paper presents the design of UPIO, a software for user-controllable parallel input and output. UPIO is designed to maximize I/O performance for scientific applications on MIMD multicomputers. The most important features of UPIO are: It supports a domain-specific file model and a variety of application interfaces to present numerous access patterns. UPIO provides user-controllable I/O operations that allow users to control data access, file structure, and data distribution. The domain-specific file model and user controllability give low I/O overhead and allow programmers to exploit the aggregate bandwidth of parallel disks.},
  comment = {They describe an interface that seems to allow easier access for programmers that want to map matrices onto parallel files. The concepts are not well explained, so it's hard to really understand what is new and different. They make no explicit comparison with other advanced interfaces like that in Vesta or Galley. No performance results.}
}

@TechReport{li:recursive-tr,
  author = {Zhiyong Li and John H. Reif and Sandeep K. S. Gupta},
  title = {Synthesizing Efficient Out-of-Core Programs for Block Recursive Algorithms using Block-Cyclic Data Distributions},
  year = {1996},
  month = {March},
  number = {96-04},
  institution = {Dept. of Computer Science, Duke University},
  later = {li:recursive},
  URL = {ftp://ftp.cs.duke.edu/pub/zli/papers/TR-96-04.ps.gz},
  keyword = {parallel I/O, out-of-core algorithm, pario-bib},
  abstract = {In this paper, we present a framework for synthesizing I/O efficient out-of-core programs for block recursive algorithms, such as the fast Fourier transform (FFT) and block matrix transposition algorithms. Our framework uses an algebraic representation which is based on tensor products and other matrix operations. The programs are optimized for the striped Vitter and Shriver's two-level memory model in which data can be distributed using various cyclic($B$) distributions in contrast to the normally used \emph{physical track} distribution cyclic($B_d$), where $B_d$ is the physical disk block size.
\par We first introduce tensor bases to capture the semantics of block-cyclic data distributions of out-of-core data and also data access patterns to out-of-core data. We then present program generation techniques for tensor products and matrix transposition. We accurately represent the number of parallel I/O operations required for the synthesized programs for tensor products and matrix transposition as a function of tensor bases and data distributions. We introduce an algorithm to determine the data distribution which optimizes the performance of the synthesized programs. Further, we formalize the procedure of synthesizing efficient out-of-core programs for tensor product formulas with various block-cyclic distributions as a dynamic programming problem. \par We demonstrate the effectiveness of our approach through several examples. We show that the choice of an appropriate data distribution can reduce the number of passes to access out-of-core data by as large as eight times for a tensor product, and the dynamic programming approach can largely reduce the number of passes to access out-of-core data for the overall tensor product formulas.}
}

@InProceedings{ligon:pfs,
  author = {W. B. Ligon and R. B. Ross},
  title = {Implementation and Performance of a Parallel File System for High Performance Distributed Applications},
  booktitle = {Proceedings of the Fifth IEEE International Symposium on High Performance Distributed Computing},
  year = {1996},
  month = {August},
  pages = {471--480},
  publisher = {IEEE Computer Society Press},
  keyword = {parallel I/O, cluster computing, parallel file system, pario-bib},
  abstract = {Dedicated cluster parallel computers (DCPCs) are emerging as low-cost high performance environments for many important applications in science and engineering. A significant class of applications that perform well on a DCPC are coarse-grain applications that involve large amounts of file I/O. Current research in parallel file systems for distributed systems is providing a mechanism for adapting these applications to the DCPC environment. We present the Parallel Virtual File System (PVFS), a system that provides disk striping across multiple nodes in a distributed parallel computer and file partitioning among tasks in a parallel program. PVFS is unique among similar systems in that it uses a stream-based approach that represents each file access with a single set of request parameters and decouples the number of network messages from details of the file striping and partitioning. PVFS also provides support for efficient collective file accesses and allows overlapping file partitions. We present results of early performance experiments that show PVFS achieves excellent speedups in accessing moderately sized file segments.}
}

% NOTE: the cite key of the next entry keeps its historical spelling
% ("madhyasta") so that existing citations still resolve; the author's
% surname is spelled Madhyastha.
@InProceedings{madhyasta:adaptive,
  author = {Tara M. Madhyastha and Christopher L. Elford and Daniel A. Reed},
  title = {Optimizing Input/Output Using Adaptive File System Policies},
  booktitle = {Proceedings of the Fifth NASA Goddard conference on Mass Storage Systems},
  year = {1996},
  month = {September},
  pages = {II:493--514},
  keyword = {multiprocessor file system, prefetching, caching, parallel I/O, multiprocessor file system interface, pario-bib}
}

@InProceedings{madhyastha:adaptive,
  author = {Tara M. Madhyastha and Daniel A. Reed},
  title = {Intelligent, Adaptive File System Policy Selection},
  booktitle = {Proceedings of the Sixth Symposium on the Frontiers of Massively Parallel Computation},
  year = {1996},
  month = {October},
  pages = {172--179},
  publisher = {IEEE Computer Society Press},
  keyword = {parallel I/O, pario-bib},
  abstract = {Traditionally, maximizing input/output performance has required tailoring application input/output patterns to the idiosyncrasies of specific input/output systems.
The authors show that one can achieve high application input/output performance via a low overhead input/output system that automatically recognizes file access patterns and adaptively modifies system policies to match application requirements. This approach reduces the application developer's input/output optimization effort by isolating input/output optimization decisions within a retargetable file system infrastructure. To validate these claims, they have built a lightweight file system policy testbed that uses a trained learning mechanism to recognize access patterns. The file system then uses these access pattern classifications to select appropriate caching strategies, dynamically adapting file system policies to changing input/output demands throughout application execution. The experimental data show dramatic speedups on both benchmarks and input/output intensive scientific applications.} } @InProceedings{majumdar:characterize, author = {S. Majumdar and Yiu Ming Leung}, title = {Characterization of applications with {I/O} for processor scheduling in multiprogrammed parallel systems}, booktitle = {Proceedings of the 1994 IEEE Symposium on Parallel and Distributed Processing}, year = {1994}, pages = {298--307}, publisher = {IEEE Computer Society Press}, keyword = {workload characterization, scheduling, parallel I/O, pario-bib}, abstract = {Most studies of processor scheduling in multiprogrammed parallel systems have ignored the I/O performed by applications. Recent studies have demonstrated that significant I/O operations are performed by a number of different classes of parallel applications. This paper focuses on some basic issues that underlie scheduling in multiprogrammed parallel environments running applications with I/O. Characterization of the I/O behavior of parallel applications is discussed first. 
Based on simulation models this research investigates the influence of these I/O characteristics on processor scheduling.} } @InProceedings{malluhi:pss, author = {Qutaibah Malluhi and William E. Johnston}, title = {Approaches for a Reliable High-Performance Distributed-Parallel Storage System}, booktitle = {Proceedings of the Fifth IEEE International Symposium on High Performance Distributed Computing}, year = {1996}, month = {August}, pages = {500--509}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, pario-bib}, abstract = {The paper studies different schemes to enhance the reliability, availability and security of a high performance distributed storage system. We have previously designed a distributed parallel storage system that employs the aggregate bandwidth of multiple data servers connected by a high speed wide area network to achieve scalability and high data throughput. The general approach of the paper employs erasure error correcting codes to add data redundancy that can be used to retrieve missing information caused by hardware, software, or human faults. The paper suggests techniques for reducing the communication and computation overhead incurred while retrieving missing data blocks from redundant information. These techniques include clustering, multidimensional coding, and the full two dimensional parity scheme.} } @InProceedings{matthews:hippi, author = {Kevin C. Matthews}, title = {Experiences Implementing a Shared File System on a {HIPPI} Disk Array}, booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems}, year = {1995}, month = {September}, pages = {77--88}, publisher = {IEEE Computer Society Press}, URL = {http://www.computer.org/conferen/mss95/matthews/matthews.htm}, keyword = {mass storage, distributed file system, parallel I/O, pario-bib}, abstract = {Shared file systems which use a physically shared mass storage device have existed for many years, although not on UNIX based operating systems.
This paper describes a shared file system (SFS) that was implemented first as a special project on the Cray Research Inc. (CRI) UNICOS operating system. A more general product was then built on top of this project using a HIPPI disk array for the shared mass storage. The design of SFS is outlined, as well as some performance experiences with the product. We describe how SFS interacts with the OSF distributed file service (DFS) and with the CRI data migration facility (DMF). We also describe possible development directions for the SFS product.} } @InProceedings{matthijs:framework, author = {F. Matthijs and Y. Berbers and P. Verbaeten}, title = {A flexible {I/O} framework for parallel and distributed systems}, booktitle = {Proceedings of the Fifth International Workshop on Object Orientation in Operating Systems}, year = {1995}, pages = {187--190}, publisher = {IEEE Computer Society Press}, keyword = {input-output programs, object-oriented, parallel systems, I/O performance, migration, dynamic load balancing, fault tolerance, parallel I/O, pario-bib}, abstract = {We propose a framework for I/O in parallel and distributed systems. The framework is highly customizable and extendible, and enables programmers to offer high level objects in their applications, without requiring them to struggle with the low level and sometimes complex details of high performance distributed I/O. Also, the framework exploits application specific information to improve I/O performance by allowing specialized programmers to customize the framework. Internally, we use indirection and granularity control to support migration, dynamic load balancing, fault tolerance, etc.
for objects of the I/O system, including those representing application data.} } @InProceedings{menasce:mass, author = {Daniel Menasc{\'e} and Odysseas Ioannis Pentakalos and Yelena Yesha}, title = {An Analytic Model of Hierarchical Mass Storage Systems With Network-Attached Storage Devices}, booktitle = {Proceedings of the 1996 ACM Sigmetrics Conference on Measurement and Modeling of Computer Systems}, year = {1996}, month = {May}, pages = {180--189}, publisher = {ACM Press}, address = {Philadelphia, PA}, keyword = {network attached peripherals, analytic model, mass storage, parallel I/O, pario-bib} } @Article{moore:ddio, author = {Jason A. Moore and Michael J. Quinn}, title = {Enhancing Disk-Directed {I/O} for Fine-Grained Redistribution of File Data}, journal = {Parallel Computing}, year = {1997}, publisher = {North-Holland (Elsevier Scientific)}, note = {To appear}, keyword = {verify publication date and pages, parallel I/O, multiprocessor file system, interprocessor communication, pario-bib}, comment = {They propose several enhancements to disk-directed I/O (see kotz:diskdir) that aim to improve performance on fine-grained distributions, that is, where each block from the disk is broken into small pieces that are scattered among the compute processors. One enhancement combines multiple pieces, possibly from separate disk blocks, into a single message. Another is to use two-phase I/O (see delrosario:two-phase), but to use disk-directed I/O to read data from the disks into CP memories, efficiently, then permute. This latter technique is probably faster than normal two-phase I/O that uses a traditional file system, not disk-directed I/O, for the read.} } @TechReport{moore:stream-tr, author = {Jason A. Moore and Philip J. Hatcher and Michael J.
Quinn}, title = {Stream*: Fast, Flexible, Data-parallel {I/O}}, year = {1994}, number = {94-80-13}, institution = {Oregon State University}, note = {Updated September 1995.}, later = {moore:stream}, URL = {http://www.cs.orst.edu/~moorej/streamstar.ps.Z}, keyword = {data parallel, parallel I/O, pario-bib}, abstract = {Although hardware supporting parallel file I/O has improved greatly since the introduction of first-generation parallel computers, the programming interface has not. Each vendor provides a different logical view of parallel files as well as nonportable operations for manipulating files. Neither do parallel languages provide standards for performing I/O. In this paper, we describe a view of parallel files for data-parallel languages, dubbed Stream*, in which each virtual processor writes to and reads from its own stream. In this scheme each virtual processor's I/O operations have the same familiar, unambiguous meaning as in a sequential C program. We demonstrate how I/O operations in Stream* can run as fast as those of vendor-specific parallel file systems on the operations most often encountered in data-parallel programs. We show how this system supports general virtual processor operations for debugging and elemental functions. Finally, we present empirical results from a prototype Stream* system running on a Meiko CS-2 multicomputer.}, comment = {See moore:stream; nearly identical. See also moore:detection. This paper gives a little bit earlier description of the Stream* idea than does moore:detection, but you'd be pretty much complete just reading moore:detection.} } @InProceedings{more:mtio, author = {Sachin More and Alok Choudhary and Ian Foster and Ming Q. 
Xu}, title = {{MTIO}: A Multi-Threaded Parallel {I/O} System}, booktitle = {Proceedings of the Eleventh International Parallel Processing Symposium}, year = {1997}, month = {April}, URL = {http://www.ece.nwu.edu/~ssmore/ipps97.ps}, keyword = {verify pages, threads, parallel I/O, pario-bib}, abstract = {This paper presents the design and evaluation of a multi-threaded runtime library for parallel I/O. We extend the multi-threading concept to separate the compute and I/O tasks in two separate threads of control. Multi-threading in our design permits a) asynchronous I/O even if the underlying file system does not support asynchronous I/O; b) copy avoidance from the I/O thread to the compute thread by sharing address space; and c) a capability to perform collective I/O asynchronously without blocking the compute threads. Further, this paper presents techniques for collective I/O which maximize load balance and concurrency while reducing communication overhead in an integrated fashion. Performance results on IBM SP2 for various data distributions and access patterns are presented. The results show that there is a tradeoff between the amount of concurrency in I/O and the buffer size designated for I/O; and there is an optimal buffer size beyond which benefits of larger requests diminish due to large communication overheads.} } @InProceedings{mowry:prefetch, author = {Todd C. Mowry and Angela K. Demke and Orran Krieger}, title = {Automatic compiler-inserted {I/O} prefetching for out-of-core applications}, booktitle = {Proceedings of the 1996 Symposium on Operating Systems Design and Implementation}, year = {1996}, month = {October}, pages = {3--17}, publisher = {USENIX Association}, URL = {http://www.usenix.org/publications/library/proceedings/osdi96/mowry.html}, keyword = {compiler, prefetch, parallel I/O, pario-bib}, abstract = {Current operating systems offer poor performance when a numeric application's working set does not fit in main memory.
As a result, programmers who wish to solve ``out-of-core'' problems efficiently are typically faced with the onerous task of rewriting an application to use explicit I/O operations (e.g., read/write). In this paper, we propose and evaluate a fully-automatic technique which liberates the programmer from this task, provides high performance, and requires only minimal changes to current operating systems. In our scheme, the compiler provides the crucial information on future access patterns without burdening the programmer, the operating system supports non-binding prefetch and release hints for managing I/O, and the operating system cooperates with a run-time layer to accelerate performance by adapting to dynamic behavior and minimizing prefetch overhead. This approach maintains the abstraction of unlimited virtual memory for the programmer, gives the compiler the flexibility to aggressively move prefetches back ahead of references, and gives the operating system the flexibility to arbitrate between the competing resource demands of multiple applications. We have implemented our scheme using the SUIF compiler and the Hurricane operating system. Our experimental results demonstrate that our fully-automatic scheme effectively hides the I/O latency in out-of-core versions of the entire NAS Parallel benchmark suite, thus resulting in speedups of roughly twofold for five of the eight applications, with two applications speeding up by threefold or more.}, comment = {Best Paper Award} } @Article{moyer:jcharacterize, author = {Steven A. Moyer and V. S. Sunderam}, title = {Characterizing Concurrency Control Performance for the {PIOUS} Parallel File System}, journal = {Journal of Parallel and Distributed Computing}, year = {1996}, month = {October}, volume = {38}, number = {1}, pages = {81--91}, earlier = {moyer:characterize}, keyword = {parallel I/O, multiprocessor file system, pario-bib} } @InProceedings{mueck:multikey, author = {T.~A. Mueck and J.
Witzmann}, title = {Multikey Index Support for Tuple Sets on Parallel Mass Storage Systems}, booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems}, year = {1995}, month = {September}, pages = {136--145}, URL = {http://www.computer.org/conferen/mss95/mueck/mueck.htm}, keyword = {parallel database, mass storage, parallel I/O, pario-bib}, abstract = {The development and evaluation of a tuple set manager (TSM) based on multikey index data structures is a main part of the PARABASE project at the University of Vienna. The TSM provides access to parallel mass storage systems using tuple sets instead of conventional files as the central data structure for application programs. A proof-of-concept prototype TSM is already implemented and operational on an iPSC/2. It supports tuple insert and delete operations as well as exact match, partial match, and range queries at system call level. Available results are from this prototype on the one hand and from various performance evaluation figures. 
The evaluation results demonstrate the performance gain achieved by the implementation of the tuple set management concept on a parallel mass storage system.} } @Article{myllymaki:buffering, author = {Jussi Myllymaki and Miron Livny}, title = {Efficient buffering for concurrent disk and tape {I/O}}, journal = {Performance Evaluation: An International Journal}, year = {1996}, volume = {27/28}, pages = {453--471}, note = {Performance~'96}, keyword = {buffering, file caching, tertiary storage, tape robot, file migration, parallel I/O, pario-bib}, comment = {Ways to use secondary and tertiary storage in parallel, and buffering mechanisms for applications with concurrent I/O requirements.} } @InProceedings{nakajo:jump1, author = {Hironori Nakajo}, title = {A Simulation-based Evaluation of a Disk {I/O} Subsystem for a Massively Parallel Computer: {JUMP-1}}, booktitle = {Proceedings of the Sixteenth International Conference on Distributed Computing Systems}, year = {1996}, month = {May}, pages = {562--569}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, I/O architecture, pario-bib}, abstract = {JUMP-1 is a distributed shared-memory massively parallel computer and is composed of multiple clusters of interconnected network called RDT (Recursive Diagonal Torus). Each cluster in JUMP-1 consists of 4 element processors, secondary cache memories, and 2 MBP (Memory Based Processor) for high-speed synchronization and communication among clusters. The I/O subsystem is connected to a cluster via a high-speed serial link called STAFF-Link. The I/O buffer memory is mapped onto the JUMP-1 global shared-memory to permit each I/O access operation as memory access. In this paper we describe evaluation of the fundamental performance of the disk I/O subsystem using event-driven simulation, and estimated performance with a Video On Demand (VOD) application.} } @InProceedings{natarajan:clusterio, author = {Chita Natarajan and Ravishankar K.
Iyer}, title = {Measurement and Simulation Based Performance Analysis of Parallel {I/O} in a High-Performance Cluster System}, booktitle = {Proceedings of the 1996 IEEE Symposium on Parallel and Distributed Processing}, year = {1996}, month = {October}, pages = {332--339}, publisher = {IEEE Computer Society Press}, keyword = {performance analysis, parallel I/O, pario-bib}, abstract = {This paper presents a measurement and simulation based study of parallel I/O in a high-performance cluster system: the Pittsburgh Supercomputing Center (PSC) DEC Alpha Supercluster. The measurements were used to characterize the performance bottlenecks and the throughput limits at the compute and I/O nodes, and to provide realistic input parameters to PioSim, a simulation environment we have developed to investigate parallel I/O performance issues in cluster systems. PioSim was used to obtain a detailed characterization of parallel I/O performance, in the high performance cluster system, for different regular access patterns and different system configurations. This paper also explores the use of local disks at the compute nodes for parallel I/O, and finds that the local disk architecture outperforms the traditional parallel I/O over remote I/O node disks architecture, even when as much as 68-75\% of the requests from each compute node goes to remote disks.} } @InProceedings{nieplocha:arrays, author = {Jarek Nieplocha and Ian Foster}, title = {Disk Resident Arrays: An Array-Oriented {I/O} Library for Out-Of-Core Computations}, booktitle = {Proceedings of the Sixth Symposium on the Frontiers of Massively Parallel Computation}, year = {1996}, month = {October}, pages = {196--204}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, pario-bib}, abstract = {In out-of-core computations, disk storage is treated as another level in the memory hierarchy, below cache, local memory, and (in a parallel computer) remote memories. 
However the tools used to manage this storage are typically quite different from those used to manage access to local and remote memory. This disparity complicates implementation of out-of-core algorithms and hinders portability. We describe a programming model that addresses this problem. This model allows parallel programs to use essentially the same mechanisms to manage the movement of data between any two adjacent levels in a hierarchical memory system. We take as our starting point the Global Arrays shared-memory model and library, which support a variety of operations on distributed arrays, including transfer between local and remote memories. We show how this model can be extended to support explicit transfer between global memory and secondary storage, and we define a Disk Resident Arrays Library that supports such transfers. We illustrate the utility of the resulting model with two applications, an out-of-core matrix multiplication and a large computational chemistry program. We also describe implementation techniques on several parallel computers and present experimental results that demonstrate that the Disk Resident Arrays model can be implemented very efficiently on parallel computers.} } @Article{nieuwejaar:jgalley, author = {Nils Nieuwejaar and David Kotz}, title = {The {Galley} Parallel File System}, journal = {Parallel Computing}, year = {1997}, publisher = {North-Holland (Elsevier Scientific)}, note = {To appear}, earlier = {nieuwejaar:jgalley-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/nieuwejaar:jgalley.ps.Z}, keyword = {verify month and pages, parallel file system, parallel I/O, multiprocessor file system interface, pario-bib, dfk}, abstract = {Most current multiprocessor file systems are designed to use multiple disks in parallel, using the high aggregate bandwidth to meet the growing I/O requirements of parallel scientific applications. 
Many multiprocessor file systems provide applications with a conventional Unix-like interface, allowing the application to access multiple disks transparently. This interface conceals the parallelism within the file system, increasing the ease of programmability, but making it difficult or impossible for sophisticated programmers and libraries to use knowledge about their I/O needs to exploit that parallelism. In addition to providing an insufficient interface, most current multiprocessor file systems are optimized for a different workload than they are being asked to support. We introduce Galley, a new parallel file system that is intended to efficiently support realistic scientific multiprocessor workloads. We discuss Galley's file structure and application interface, as well as the performance advantages offered by that interface.}, comment = {A revised version of nieuwejaar:jgalley-tr, which is a combination of nieuwejaar:galley and nieuwejaar:galley-perf.} } @TechReport{nieuwejaar:jgalley-tr, author = {Nils Nieuwejaar and David Kotz}, title = {The {Galley} Parallel File System}, year = {1996}, month = {May}, number = {PCS-TR96-286}, institution = {Dept. of Computer Science, Dartmouth College}, note = {To appear in {\em Parallel Computing}.}, earlier = {nieuwejaar:galley}, later = {nieuwejaar:jgalley}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR96-286.ps.Z}, keyword = {parallel file system, parallel I/O, multiprocessor file system interface, pario-bib, dfk}, abstract = {Most current multiprocessor file systems are designed to use multiple disks in parallel, using the high aggregate bandwidth to meet the growing I/O requirements of parallel scientific applications. Many multiprocessor file systems provide applications with a conventional Unix-like interface, allowing the application to access multiple disks transparently. 
This interface conceals the parallelism within the file system, increasing the ease of programmability, but making it difficult or impossible for sophisticated programmers and libraries to use knowledge about their I/O needs to exploit that parallelism. In addition to providing an insufficient interface, most current multiprocessor file systems are optimized for a different workload than they are being asked to support. We introduce Galley, a new parallel file system that is intended to efficiently support realistic scientific multiprocessor workloads. We discuss Galley's file structure and application interface, as well as the performance advantages offered by that interface.} } @PhdThesis{nieuwejaar:thesis, author = {Nils A. Nieuwejaar}, title = {Galley: A New Parallel File System for Parallel Applications}, year = {1996}, month = {November}, school = {Dept. of Computer Science, Dartmouth College}, note = {Available as Dartmouth Technical Report PCS-TR96-300}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR96-300.ps.Z}, keyword = {parallel I/O, multiprocessor file system, file system workload characterization, file access patterns, file system interface, pario-bib}, abstract = {Most current multiprocessor file systems are designed to use multiple disks in parallel, using the high aggregate bandwidth to meet the growing I/O requirements of parallel scientific applications. Most multiprocessor file systems provide applications with a conventional Unix-like interface, allowing the application to access those multiple disks transparently. This interface conceals the parallelism within the file system, increasing the ease of programmability, but making it difficult or impossible for sophisticated application and library programmers to use knowledge about their I/O to exploit that parallelism. In addition to providing an insufficient interface, most current multiprocessor file systems are optimized for a different workload than they are being asked to support. 
In this work we examine current multiprocessor file systems, as well as how those file systems are used by scientific applications. Contrary to the expectations of the designers of current parallel file systems, the workloads on those systems are dominated by requests to read and write small pieces of data. Furthermore, rather than being accessed sequentially and contiguously, as in uniprocessor and supercomputer workloads, files in multiprocessor file systems are accessed in regular, structured, but non-contiguous patterns. Based on our observations of multiprocessor workloads, we have designed Galley, a new parallel file system that is intended to efficiently support realistic scientific multiprocessor workloads. In this work, we introduce Galley and discuss its design and implementation. We describe Galley's new three-dimensional file structure and discuss how that structure can be used by parallel applications to achieve higher performance. We introduce several new data-access interfaces, which allow applications to explicitly describe the regular access patterns we found to be common in parallel file system workloads. We show how these new interfaces allow parallel applications to achieve tremendous increases in I/O performance. 
Finally, we discuss how Galley's new file structure and data-access interfaces can be useful in practice.} } @Article{nieuwejaar:workload, author = {Nils Nieuwejaar and David Kotz and Apratim Purakayastha and Carla Schlatter Ellis and Michael Best}, title = {File-Access Characteristics of Parallel Scientific Workloads}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {1996}, month = {October}, volume = {7}, number = {10}, pages = {1075--1089}, publisher = {IEEE Computer Society Press}, earlier = {nieuwejaar:workload-tr}, URL = {http://www.computer.org/pubs/tpds/abs96.htm#1075td1096}, keyword = {parallel I/O, file system workload, workload characterization, file access pattern, multiprocessor file system, dfk, pario-bib}, abstract = {Phenomenal improvements in the computational performance of multiprocessors have not been matched by comparable gains in I/O system performance. This imbalance has resulted in I/O becoming a significant bottleneck for many scientific applications. One key to overcoming this bottleneck is improving the performance of multiprocessor file systems. \par The design of a high-performance multiprocessor file system requires a comprehensive understanding of the expected workload. Unfortunately, until recently, no general workload studies of multiprocessor file systems have been conducted. The goal of the CHARISMA project was to remedy this problem by characterizing the behavior of several production workloads, on different machines, at the level of individual reads and writes. The first set of results from the CHARISMA project describe the workloads observed on an Intel iPSC/860 and a Thinking Machines CM-5. This paper is intended to compare and contrast these two workloads for an understanding of their essential similarities and differences, isolating common trends and platform-dependent variances. 
Using this comparison, we are able to gain more insight into the general principles that should guide multiprocessor file-system design.}, comment = {See also kotz:workload, nieuwejaar:strided, ap:workload.} } @InProceedings{nodine:deterministic, author = {M. H. Nodine and J. S. Vitter}, title = {Deterministic Distribution Sort in Shared and Distributed Memory Multiprocessors}, booktitle = {Proceedings of the Fifth Symposium on Parallel Algorithms and Architectures}, year = {1993}, pages = {120--129}, address = {Velen, Germany}, URL = {ftp://cs.duke.edu/pub/jsv/Papers/NoV93.distr_sorting.ps.Z}, keyword = {parallel I/O algorithm, sorting, pario-bib}, abstract = {We present an elegant deterministic load balancing strategy for distribution sort that is applicable to a wide variety of parallel disks and parallel memory hierarchies with both single and parallel processors. The simplest application of the strategy is an optimal deterministic algorithm for external sorting with multiple disks and parallel processors. In each input/output (I/O) operation, each of the $D \geq 1$ disks can simultaneously transfer a block of $B$ contiguous records. Our two measures of performance are the number of I/Os and the amount of work done by the CPU(s); our algorithm is simultaneously optimal for both measures. We also show how to sort deterministically in parallel memory hierarchies. When the processors are interconnected by any sort of a PRAM, our algorithms are optimal for all parallel memory hierarchies; when the interconnection network is a hypercube, our algorithms are either optimal or best-known.}, comment = {Short version of nodine:sort2 and nodine:sortdisk.} } @InProceedings{nurmi:atm, author = {Marc A. Nurmi and William E. Bejcek and Rod N. Gregoire and K. C. Liu and Mark D.
Pohl}, title = {Automatic Management of {CPU} and {I/O} Bottlenecks in Distributed Applications on {ATM} Networks}, booktitle = {Proceedings of the Fifth IEEE International Symposium on High Performance Distributed Computing}, year = {1996}, month = {August}, pages = {481--489}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, ATM, parallel networking, pario-bib}, abstract = {Existing parallel programming environments for networks of workstations improve the performance of computationally intensive applications by using message passing or virtual shared memory to alleviate CPU bottlenecks. This paper describes an approach based on message passing that addresses both CPU and I/O bottlenecks for a specific class of distributed applications on ATM networks. ATM provides the bandwidth required to utilize multiple I/O channels in parallel. This paper also describes an environment based on distributed process management and centralized application management that implements the approach. The environment adds processes to a running application when necessary to alleviate CPU and I/O bottlenecks while managing process connections in a manner that is transparent to the application.} } @TechReport{ober:seismic, author = {Curtis Ober and Ron Oldfield and John VanDyke and David Womble}, title = {Seismic Imaging on Massively Parallel Computers}, year = {1996}, month = {April}, number = {SAND96-1112}, institution = {Sandia National Laboratories}, URL = {ftp://ftp.cs.sandia.gov/pub/papers/dewombl/seismic_imaging_mpp.ps.Z}, keyword = {multiprocessor application, scientific computing, seismic data processing, parallel I/O, pario-bib}, abstract = {Fast, accurate imaging of complex, oil-bearing geologies, such as overthrusts and salt domes, is the key to reducing the costs of domestic oil and gas exploration. 
Geophysicists say that the known oil reserves in the Gulf of Mexico could be significantly increased if accurate seismic imaging beneath salt domes was possible. A range of techniques exist for imaging these regions, but the highly accurate techniques involve the solution of the wave equation and are characterized by large data sets and large computational demands. Massively parallel computers can provide the computational power for these highly accurate imaging techniques. \par A brief introduction to seismic processing will be presented, and the implementation of a seismic-imaging code for distributed memory computers will be discussed. The portable code, Salvo, performs a wave-equation-based, 3-D, prestack, depth imaging and currently runs on the Intel Paragon, the Cray T3D and SGI Challenge series. It uses MPI for portability, and has sustained 22 Mflops/sec/proc (compiled FORTRAN) on the Intel Paragon.}, comment = {2 pages about their I/O scheme, mostly regarding a calculation of the optimal balance between compute nodes and I/O nodes to achieve perfect overlap.} } @InProceedings{park:interface, author = {Yoonho Park and Ridgway Scott and Stuart Sechrest}, title = {Virtual Memory Versus File Interfaces for Large, Memory-intensive Scientific Applications}, booktitle = {Proceedings of Supercomputing '96}, year = {1996}, month = {November}, publisher = {ACM Press and IEEE Computer Society Press}, note = {Also available as UH Department of Computer Science Research Report UH-CH-96-7}, URL = {http://www.hpc.uh.edu/cenju/pub/vm_revisit.ps}, keyword = {virtual memory, file interface, scientific applications, out-of-core, parallel I/O, pario-bib}, abstract = {Scientific applications often require some strategy for temporary data storage to do the largest possible simulations. The use of virtual memory for temporary data storage has received criticism because of performance problems. 
However, modern virtual memory found in recent operating systems such as Cenju-3/DE give application writers control over virtual memory policies. We demonstrate that custom virtual memory policies can dramatically reduce virtual memory overhead and allow applications to run out-of-core efficiently. We also demonstrate that the main advantage of virtual memory, namely programming simplicity, is not lost.}, comment = {Web and CDROM only.} } @InProceedings{salmon:nbody, author = {John Salmon and Michael Warren}, title = {Parallel Out-of-core Methods for {N}-body Simulation}, booktitle = {Proceedings of the Eighth SIAM Conference on Parallel Processing for Scientific Computing}, year = {1997}, URL = {http://www.cacr.caltech.edu/~johns/pubs/siam97/}, keyword = {verify pages and month, parallel I/O, out of core applications, scientific computing, pario-bib}, abstract = {Hierarchical treecodes have, to a large extent, converted the compute-bound N-body problem into a memory-bound problem. The large ratio of DRAM to disk pricing suggests use of out-of-core techniques to overcome memory capacity limitations. We will describe a parallel, out-of-core treecode library, targeted at machines with independent secondary storage associated with each processor. 
Borrowing the space-filling curve techniques from our in-core library, and ``manually'' paging, results in excellent spatial and temporal locality and very good performance.} } @TechReport{scheuermann:partition2, author = {Peter Scheuermann and Gerhard Weikum and Peter Zabback}, title = {Data Partitioning and Load Balancing in Parallel Disk Systems}, year = {1996}, month = {April}, number = {A/02/96}, institution = {Universit{\"a}t des Saarlandes}, address = {Saarbr{\"u}cken, Germany}, note = {Submitted to VLDB Journal.}, earlier = {scheuermann:partition}, keyword = {verify, parallel I/O, disk array, disk striping, load balance, pario-bib}, comment = {Updated version of scheuermann:partition.} } @Article{seamons:jpanda, author = {Kent E. Seamons and Marianne Winslett}, title = {Multidimensional Array {I/O} in {Panda~1.0}}, journal = {Journal of Supercomputing}, year = {1996}, volume = {10}, number = {2}, pages = {191--211}, earlier = {seamons:interface}, keyword = {parallel I/O, collective I/O, pario-bib} } @PhdThesis{seamons:thesis, author = {Kent E. Seamons}, title = {Panda: Fast Access to Persistent Arrays Using High Level Interfaces and Server Directed Input/Output}, year = {1996}, month = {May}, school = {University of Illinois at Urbana-Champaign}, URL = {http://bunny.cs.uiuc.edu/CDR/pubs/seamons-thesis.html}, keyword = {parallel I/O, persistent data, parallel computing, pario-bib}, abstract = {Multidimensional arrays are a fundamental data type in scientific computing and are used extensively across a broad range of applications. Often these arrays are persistent, i.e., they outlive the invocation of the program that created them. Portability and performance with respect to input and output (i/o) pose significant challenges to applications accessing large persistent arrays, especially in distributed-memory environments. 
A significant number of scientific applications perform conceptually simple array i/o operations, such as reading or writing a subarray, an entire array, or a list of arrays. However, the algorithms to perform these operations efficiently on a given platform may be complex and non-portable, and may require costly customizations to operating system software. \par This thesis presents a high-level interface for array i/o and three implementation architectures, embodied in the Panda (Persistence AND Arrays) array i/o library. The high-level interface contributes to application portability, by encapsulating unnecessary details and being easy to use. Performance results using Panda demonstrate that an i/o system can provide application programs with a high-level, portable, easy-to-use interface for array i/o without sacrificing performance or requiring custom system software; in fact, combining all these benefits may only be possible through a high-level interface due to the great freedom and flexibility a high-level interface provides for the underlying implementation. \par The Panda server-directed i/o architecture is a prime example of an efficient implementation of collective array i/o for closely synchronized applications in distributed-memory single-program multiple-data (SPMD) environments. A high-level interface is instrumental to the good performance of server-directed i/o, since it provides a global view of an upcoming collective i/o operation that Panda uses to plan sequential reads and writes. Performance results show that with server-directed i/o, Panda achieves throughputs close to the maximum AIX file system throughput on the i/o nodes of the IBM SP2 when reading and writing large multidimensional arrays.}, comment = {see also chen:panda, seamons:panda, seamons:compressed, seamons:interface, seamons:schemas, seamons:msio, seamons:jpanda} } @TechReport{shriver:api-tr, author = {Elizabeth A.~M. Shriver and Leonard F. 
Wisniewski}, title = {An {API} for Choreographing Data Accesses}, year = {1995}, month = {November}, number = {PCS-TR95-267}, institution = {Dept. of Computer Science, Dartmouth College}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR95-267.ps.Z}, keyword = {parallel I/O, multiprocessor file system interface, pario-bib}, abstract = {Current APIs for multiprocessor multi-disk file systems are not easy to use in developing out-of-core algorithms that choreograph parallel data accesses. Consequently, the efficiency of these algorithms is hard to achieve in practice. We address this deficiency by specifying an API that includes data-access primitives for data choreography. With our API, the programmer can easily access specific blocks from each disk in a single operation, thereby fully utilizing the parallelism of the underlying storage system. Our API supports the development of libraries of commonly-used higher-level routines such as matrix-matrix addition, matrix-matrix multiplication, and BMMC (bit-matrix-multiply/complement) permutations. We illustrate our API in implementations of these three high-level routines to demonstrate how easy it is to use.}, comment = {Also published as Courant Institute Tech Report 708.} } @InProceedings{si-woong:cluster, author = {Jang Si-Woong and Chung Ki-Dong and Sam Coleman}, title = {Design and Implementation of a Network-Wide Concurrent File System in a Workstation Cluster}, booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems}, year = {1995}, month = {September}, pages = {239--245}, publisher = {IEEE Computer Society Press}, URL = {http://www.computer.org/conferen/mss95/woong/woong.htm}, keyword = {mass storage, cluster computing, distributed file system, parallel I/O, pario-bib}, abstract = {We estimate the performance of a network-wide concurrent file system implemented using conventional disks as disk arrays. Tests were carried out on both single system and network-wide environments. 
On single systems, a file was split across several disks to test the performance of file I/O operations. We concluded that performance was proportional to the number of disks, up to four, on a system with high computing power. Performance of a system with low computing power, however, did not increase, even with more than two disks. When we split a file across disks in a network-wide system called the Network-wide Concurrent File System (N-CFS), we found performance similar to or slightly higher than that of disk arrays on single systems. Since file access through N-CFS is transparent, this system enables traditional disks on single and networked systems to be used as disk arrays for I/O intensive jobs.} } @InProceedings{smirni:evolutionary, author = {Evgenia Smirni and Ruth A. Aydt and Andrew A. Chien and Daniel A. Reed}, title = {{I/O} Requirements of Scientific Applications: An Evolutionary View}, booktitle = {Proceedings of the Fifth IEEE International Symposium on High Performance Distributed Computing}, year = {1996}, month = {August}, pages = {49--59}, publisher = {IEEE Computer Society Press}, address = {Syracuse, NY}, URL = {http://www-pablo.cs.uiuc.edu/People/esmirni/docs/IOhpdc96.ps.Z}, keyword = {I/O, workload characterization, scientific computing, parallel I/O, pario-bib}, abstract = {The modest I/O configurations and file system limitations of many current high-performance systems preclude solution of problems with large I/O needs. I/O hardware and file system parallelism is the key to achieving high performance. We analyze the I/O behavior of several versions of two scientific applications on the Intel Paragon XP/S. The versions involve incremental application code enhancements across multiple releases of the operating system. Studying the evolution of I/O access patterns underscores the interplay between application access patterns and file system features. 
Our results show that both small and large request sizes are common, that at present, application developers must manually aggregate small requests to obtain high disk transfer rates, that concurrent file accesses are frequent, and that appropriate matching of the application access pattern and the file system access mode can significantly increase application I/O performance. Based on these results, we describe a set of file system design principles.}, comment = {They study two applications over several versions, using Pablo to capture the I/O activity. They thus watch as application developers improve the applications' use of I/O modes and request sizes. Both applications move through three phases: initialization, computation (with out-of-core I/O or checkpointing I/O), and output. They found it necessary to tune the I/O request sizes to match the parameters of the I/O system. In the initial versions, the code used small read and write requests, which were (according to the developers) the ``easiest and most natural implementation for their I/O.'' They restructured the I/O to make bigger requests, which better matched the capabilities of Intel PFS. They conclude that asynchronous and collective operations are imperative. They would like to see a file system that can adapt dynamically to adjust its policies to the apparent access patterns. 
Automatic request aggregation of some kind seems like a good idea; of course, that is one feature of a buffer cache.} } @InProceedings{srinilta:strategies, author = {Chutimet Srinilta and Divyesh Jadav and Alok Choudhary}, title = {Design and Evaluation of Data Storage and Retrieval Strategies in a Distributed Memory Continuous Media Server}, booktitle = {Proceedings of the Eleventh International Parallel Processing Symposium}, year = {1997}, month = {April}, URL = {http://www.ece.nwu.edu/~csrinilt/ipps97.ps}, keyword = {verify pages, threads, parallel I/O, pario-bib}, abstract = {High performance servers and high-speed networks will form the backbone of the infra-structure required for distributed multimedia information systems. Given that the goal of such a server is to support hundreds of interactive data streams simultaneously, various tradeoffs are possible with respect to the storage of data on secondary memory, and its retrieval therefrom. In this paper we identify and evaluate these tradeoffs. We evaluate the effect of varying the stripe factor and also the performance of batched retrieval of disk-resident data. We develop a methodology to predict the stream capacity of such a server. The evaluation is done for both uniform and skewed access patterns. Experimental results on the Intel Paragon computer are presented.} } @MastersThesis{subramaniam:msthesis, author = {Mahesh Subramaniam}, title = {Efficient Implementation of Server-Directed {I/O}}, year = {1996}, month = {June}, school = {Dept. of Computer Science, University of Illinois}, URL = {http://bunny.cs.uiuc.edu/CDR/pubs/mahesh-thesis.html}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, abstract = {Parallel computers are a cost effective approach to providing significant computational resources to a broad range of scientific and engineering applications. 
Due to the relatively lower performance of the I/O subsystems on these machines and due to the significant I/O requirements of these applications, the I/O performance can become a major bottleneck. Optimizing the I/O phase of these applications poses a significant challenge. A large number of these scientific and engineering applications perform simple operations on multidimensional arrays and providing an easy and efficient mechanism for implementing these operations is important. The Panda array I/O library provides simple high level interfaces to specify collective I/O operations on multidimensional arrays in a distributed memory single-program multiple-data (SPMD) environment. The high level information provided by the user through these interfaces allows the Panda array I/O library to produce an efficient implementation of the collective I/O request. The use of these high level interfaces also increases the portability of the application. \par This thesis presents an efficient and portable implementation of the Panda array I/O library. In this implementation, standard software components are used to build the I/O library to aid its portability. The implementation also provides a simple, flexible framework for the implementation and integration of the various collective I/O strategies. The server directed I/O and the reduced messages server directed I/O algorithms are implemented in the Panda array I/O library. This implementation supports the sharing of the I/O servers between multiple applications by extending the collective I/O strategies. Also, the implementation supports the use of part time I/O nodes where certain designated compute nodes act as the I/O servers during the I/O phase of the application. 
The performance of this implementation of the Panda array I/O library is measured on the IBM SP2 and the performance results show that for read and write operations, the collective I/O strategies used by the Panda array I/O library achieve throughputs close to the maximum throughputs provided by the underlying file system on each I/O node of the IBM SP2.} } @InProceedings{thakur:abstract, author = {Rajeev Thakur and William Gropp and Ewing Lusk}, title = {An Abstract-Device Interface for Implementing Portable Parallel-{I/O} Interfaces}, booktitle = {Proceedings of the Sixth Symposium on the Frontiers of Massively Parallel Computation}, year = {1996}, month = {October}, pages = {180--187}, earlier = {thakur:abstract-tr}, URL = {http://www.mcs.anl.gov/home/thakur/adio.ps}, keyword = {parallel I/O, multiprocessor file system interface, pario-bib}, abstract = {In this paper, we propose a strategy for implementing parallel-I/O interfaces portably and efficiently. We have defined an abstract-device interface for parallel I/O, called ADIO. Any parallel-I/O API can be implemented on multiple file systems by implementing the API portably on top of ADIO, and implementing only ADIO on different file systems. This approach simplifies the task of implementing an API and yet exploits the specific high-performance features of individual file systems. We have used ADIO to implement the Intel PFS interface and subsets of MPI-IO and IBM PIOFS interfaces on PFS, PIOFS, Unix, and NFS file systems. 
Our performance studies indicate that the overhead of using ADIO as an implementation strategy is very low.} } @TechReport{thakur:abstract-tr, author = {Rajeev Thakur and William Gropp and Ewing Lusk}, title = {An Abstract-Device Interface for Implementing Portable Parallel-{I/O} Interfaces}, year = {1996}, month = {May}, number = {MCS-P592-0596}, institution = {Argonne National Laboratory, Mathematics and Computer Science Division}, later = {thakur:abstract}, URL = {http://www.mcs.anl.gov/home/thakur/adio.ps}, keyword = {multiprocessor file system interface, parallel I/O, pario-bib}, comment = {They propose an intermediate interface that can serve as an implementation base for all parallel file-system APIs, and which can itself be implemented on top of all parallel file systems. This ``universal'' interface allows all apps to run on all file systems with no porting, and for people to experiment with different APIs.} } @TechReport{thakur:evaluation-tr, author = {Rajeev Thakur and William Gropp and Ewing Lusk}, title = {An Experimental Evaluation of the Parallel {I/O} Systems of the {IBM~SP} and {Intel Paragon} Using a Production Application}, year = {1996}, month = {February}, number = {MCS-P569-0296}, institution = {Argonne National Laboratory}, later = {thakur:evaluation}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, abstract = {This paper presents the results of an experimental evaluation of the parallel I/O systems of the IBM SP and Intel Paragon. For the evaluation, we used a full, three-dimensional application code that is in production use for studying the nonlinear evolution of Jeans instability in self-gravitating gaseous clouds. The application performs I/O by using library routines that we developed and optimized separately for parallel I/O on the SP and Paragon. The I/O routines perform two-phase I/O and use the PIOFS file system on the SP and PFS on the Paragon. 
We studied the I/O performance for two different sizes of the application. We found that for the small case, I/O was faster on the SP, whereas for the large case, I/O took almost the same time on both systems. Communication required for I/O was faster on the Paragon in both cases. The highest read bandwidth obtained was 48 Mbytes/sec. and the highest write bandwidth obtained was 31.6 Mbytes/sec., both on the SP.}, comment = {This version no longer on the web.} } @Article{thakur:jext2phase, author = {Rajeev Thakur and Alok Choudhary}, title = {An Extended Two-Phase Method for Accessing Sections of Out-of-Core Arrays}, journal = {Scientific Programming}, year = {1996}, month = {Winter}, volume = {5}, number = {4}, pages = {301--317}, earlier = {thakur:ext2phase2}, URL = {http://www.mcs.anl.gov/home/thakur/ext2ph.ps}, keyword = {parallel I/O, pario-bib}, abstract = {A number of applications on parallel computers deal with very large data sets that cannot fit in main memory. In such applications, data must be stored in files on disks and fetched into memory during program execution. Parallel programs with large out-of-core arrays stored in files must read/write smaller sections of the arrays from/to files. In this article, we describe a method for accessing sections of out-of-core arrays efficiently. Our method, the extended two-phase method, uses collective I/O: Processors cooperate to combine several I/O requests into fewer larger granularity requests, reorder requests so that the file is accessed in proper sequence, and eliminate simultaneous I/O requests for the same data. In addition, the I/O workload is divided among processors dynamically, depending on the access requests. We present performance results obtained from two real out-of-core parallel applications---matrix multiplication and a Laplace's equation solver---and several synthetic access patterns, all on the Intel Touchstone Delta. 
These results indicate that the extended two-phase method significantly outperformed a direct (noncollective) method for accessing out-of-core array sections.} } @Article{thakur:jpassion, author = {Rajeev Thakur and Alok Choudhary and Rajesh Bordawekar and Sachin More and Sivaramakrishna Kuditipudi}, title = {Passion: Optimized {I/O} for Parallel Applications}, journal = {IEEE Computer}, year = {1996}, month = {June}, volume = {29}, number = {6}, pages = {70--78}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, pario-bib}, comment = {See thakur:passion, choudhary:passion.} } @TechReport{thomas:panda, author = {Joel T. Thomas}, title = {The {Panda} Array {I/O} Library on the {Galley} Parallel File System}, year = {1996}, month = {June}, number = {PCS-TR96-288}, institution = {Dept. of Computer Science, Dartmouth College}, note = {Senior Honors Thesis.}, URL = {http://www.cs.dartmouth.edu/reports/abstracts/TR96-288/}, keyword = {multiprocessor file system, parallel I/O, pario-bib}, abstract = {The Panda Array I/O library, created at the University of Illinois, Urbana-Champaign, was built especially to address the needs of high-performance scientific applications. I/O has been one of the most frustrating bottlenecks to high performance for quite some time, and the Panda project is an attempt to ameliorate this problem while still providing the user with a simple, high-level interface. The Galley File System, with its hierarchical structure of files and strided requests, is another attempt at addressing the performance problem. My project was to redesign the Panda Array library for use on the Galley file system. This project involved porting Panda's three main functions: a checkpoint function for writing a large array periodically for 'safekeeping,' a restart function that would allow a checkpointed file to be read back in, and finally a timestep function that would allow the user to write a group of large arrays several times in a sequence. 
Panda supports several different distributions in both the compute-node memories and I/O-node disks. \par We have found that the Galley File System provides a good environment on which to build high-performance libraries, and that the mesh of Panda and Galley was a successful combination.}, comment = {See seamons:thesis.} } @InProceedings{vengroff:efficient2, author = {Darren Erik Vengroff and Jeffrey Scott Vitter}, title = {{I/O}-Efficient Scientific Computation Using {TPIE}}, booktitle = {Proceedings of the Fifth NASA Goddard Conference on Mass Storage Systems}, year = {1996}, month = {September}, pages = {II:553--570}, keyword = {parallel I/O algorithms, run-time support, parallel I/O, multiprocessor file system interface, pario-bib}, comment = {Same as vengroff:efficient?} } @InProceedings{venugopal:delays, author = {C.~R. Venugopal and S.~S.~S.~P. Rao}, title = {Impact of Delays in Parallel {I/O} System: An Empirical Study}, booktitle = {Proceedings of the Fifth IEEE International Symposium on High Performance Distributed Computing}, year = {1996}, month = {August}, pages = {490--499}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, pario-bib}, abstract = {Performance of I/O intensive applications on a multiprocessor system depends mostly on the variety of disk access delays encountered in the I/O system. Over the years, the improvement in disk performance has taken place more slowly than the corresponding increase in processor speeds. It is therefore necessary to model I/O delays and evaluate performance benefits of moving an application to a better multiprocessor system. We perform such an analysis by measuring I/O delays for a synthesized application that uses a parallel distributed file system. The aim of this study is to evaluate the performance benefits of better disks in a multiprocessor system. 
We report on how the I/O performance would be affected if an application were to run on a system which would have better disks and communication links. In this study, we show a substantial improvement in the performance of an I/O system with better disks and communication links with respect to the existing system.} } @InProceedings{watson:hpss, author = {Richard W. Watson and Robert A. Coyne}, title = {The Parallel {I/O} Architecture of the High-Performance Storage System ({HPSS})}, booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems}, year = {1995}, month = {September}, pages = {27--44}, publisher = {IEEE Computer Society Press}, URL = {http://www.computer.org/conferen/mss95/watson/watson.htm}, keyword = {mass storage, parallel I/O, multiprocessor file system interface, pario-bib}, abstract = {Datasets up to terabyte size and petabyte total capacities have created a serious imbalance between I/O and storage-system performance and system functionality. One promising approach is the use of parallel data-transfer techniques for client access to storage, peripheral-to-peripheral transfers, and remote file transfers. This paper describes the parallel I/O architecture and mechanisms, parallel transport protocol (PTP), parallel FTP, and parallel client application programming interface (API) used by the high-performance storage system (HPSS). Parallel storage integration issues with a local parallel file system are also discussed.} }