BibTeX bibliography file: Parallel I/O Ongoing Edition Last updated: 1999.Jul.24 This edition supercedes my older bibliographies. This bibliography is available on the WWW at http://www.cs.dartmouth.edu/pario/bib/ and by ftp at ftp://ftp.cs.dartmouth.edu/pub/pario/pario.bib Both of which are easily reached by the Parallel-I/O Archive at http://www.cs.dartmouth.edu/pario/ This bibliography covers parallel I/O, with a significant emphasis on file systems rather than, say, network or graphics I/O. This includes architecture, operating systems, some algorithms, some databases, and some workload characterization. Because of the expanding nature of this field, I cannot cover everything, and this bibliography is admittedly spotty on topics like disk arrays, parallel databases, and parallel networking. The entries are alphabetized by cite key. The emphasis is on including everything I have, rather than selecting a few key articles of interest. Thus, you probably don't want (or need) to read everything here. There are many repeated entries, in the sense that a paper is often published first as a TR, then in a conference, then in a journal. The "earlier" and "later" tags tie together versions of a paper. Except where noted, all comments are mine, and any opinions expressed there are mine only. In some cases I am simply restating the opinion or result obtained by the paper's authors, and thus even I might disagree with the statement. I keep most editorial comments to a minimum. Please send any additions or corrections (new abstracts and URLs would be great!) to me at this address (I have to hide the address in a string so bibtex won't complain about this header): @string{parallel-io-bib = "parallel-io-bib@dartmouth.edu"} Indeed, if you want to get updates to the bibliography (released once per week), subscribe to that mailing list by sending a message to majordomo (see address below) whose BODY says subscribe parallel-io-bib The address for majordomo is below; I have to hide it in a string so bibtex won't complain about this header. @string{majordomo = "majordomo@dartmouth.edu"} You may use the bibliography as you please except for publishing it as a whole, since the compilation is mine. Please leave this header on the collection; BibTeX won't mind. David Kotz Associate Professor Department of Computer Science Dartmouth College 6211 Sudikoff Laboratory Hanover, NH 03755-3510 USA URL: http://www.cs.dartmouth.edu/~dfk/ 603-646-1439 @string {email = "dfk@cs.dartmouth.edu"} % have to hide this from bibtex % BibTeX bibliography file @InProceedings{abali:ibm370, author = {B\"{u}lent Abali and Bruce D. Gavril and Richard W. Hadsell and Linh Lam and Brion Shimamoto}, title = {{Many/370: A} Parallel Computer Prototype for {I/O} Intensive Applications}, booktitle = {Proceedings of the Sixth Annual Distributed-Memory Computer Conference}, year = {1991}, pages = {728--730}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, comment = {Describes a parallel IBM/370, where they attach several small 370s to a switch, and several disks to each 370. Not much in the way of striping.} } @Book{abello:dimacs, title = {External Memory Algorithms and Visualization}, booktitle = {External Memory Algorithms and Visualization}, editor = {James Abello and Jeffrey Scott Vitter}, year = {1999}, series = {DIMACS Series in Discrete Mathematics and Theoretical Computer Science}, publisher = {American Mathematical Society Press}, address = {Providence, RI}, keyword = {verify publication and editors, parallel I/O, out-of-core algorithm, pario-bib}, comment = {See also the component papers vitter:survey, arge:lower, crauser:segment, grossi:crosstrees, toledo:survey. Not clear to what extent these papers are about *parallel* I/O.} } @InProceedings{abello:graph, author = {James Abello and Adam L. Buchsbaum and Jeffrey R. Westbrook}, title = {A Functional Approach to External Memory Graph Algorithms}, booktitle = {Proceedings of the 6th Annual European Symposium on Algorithms}, year = {1998}, month = {August}, series = {Lecture Notes in Computer Science}, volume = {1461}, pages = {332--343}, publisher = {Springer-Verlag}, address = {Venice, Italy}, keyword = {verify publication authors and pages, out-of-core algorithm, graph, pario-bib} } @Article{abu-safah:speedup, author = {Walid Abu-Safah and Harlan Husmann and David Kuck}, title = {On {Input/Output} Speed-up in Tightly-coupled Multiprocessors}, journal = {IEEE Transactions on Computers}, year = {1986}, pages = {520--530}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, I/O, pario-bib}, comment = {Derives formulas for the speedup with and without I/O considered and with parallel software and hardware format conversion. Considering I/O gives a more optimistic view of the speedup of a program {\em assuming} that the parallel version can use its I/O bandwidth as effectively as the serial processor. Concludes that, for a given number of processors, increasing the I/O bandwidth is the most effective way to speed up the program (over the format conversion improvements).} } @InProceedings{acharya:tuning, author = {Anurag Acharya and Mustafa Uysal and Robert Bennett and Assaf Mendelson and Michael Beynon and Jeffrey K. Hollingsworth and Joel Saltz and Alan Sussman}, title = {Tuning the Performance of {I/O} Intensive Parallel Applications}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {15--27}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, filesystem workload, parallel application, pario-bib}, abstract = {Getting good I/O performance from parallel programs is a critical problem for many application domains. In this paper, we report our experience tuning the I/O performance of four application programs from the areas of satellite-data processing and linear algebra. After tuning, three of the four applications achieve application-level I/O rates of over 100 MB/s on 16 processors. The total volume of I/O required by the programs ranged from about 75 MB to over 200 GB. We report the lessons learned in achieving high I/O performance from these applications, including the need for code restructuring, local disks on every node and knowledge of future I/O requests. We also report our experience on achieving high performance on peer-to-peer configurations. Finally, we comment on the necessity of complex I/O interfaces like collective I/O and strided requests to achieve high performance.} } @Article{aggarwal:sorting, author = {Alok Aggarwal and Jeffrey Scott Vitter}, title = {The Input/Output Complexity of Sorting and Related Problems}, journal = {Communications of the ACM}, year = {1988}, month = {September}, volume = {31}, number = {9}, pages = {1116--1127}, keyword = {parallel I/O, sorting, pario-bib}, abstract = {We provide tight upper and lower bounds, up to a constant factor, for the number of inputs and outputs~(I/Os) between internal memory and secondary storage required for five sorting-related problems: sorting, the fast Fourier transform (FFT), permutation networks, permuting, and matrix transposition. The bounds hold both in the worst case and in the average case, and in several situations the constant factors match. Secondary storage is modeled as a magnetic disk capable of transfering $P$~blocks each containing $B$~records in a single time unit; the records in each block must be input from or output to $B$~contiguous locations on the disk. We give two optimal algorithms for the problems, which are variants of merge sorting and distribution sorting. In particular we show for $P=1$ that the standard merge sorting algorithm is an optimal external sorting method, up to a constant factor in the number of~I/Os. Our sorting algorithms use the same number of~I/Os as does the permutation phase of key sorting, except when the internal memory size is extremely small, thus affirming the popular adage that key sorting is not faster. We also give a simpler and more direct derivation of Hong and Kung's lower bound for the FFT for the special case $B = P = O(1)$.}, comment = {Good comments on typical external sorts, and how big they are. Focuses on parallelism at the disk. They give tight theoretical bounds on the number of I/O's required to do external sorting and other problems (FFTs, matrix transpose, etc.). If $x$ is the number of blocks in the file and $y$ is the number of blocks that fit in memory, then the number of I/Os needed grows as $\Theta (x \log x / \log y)$. If parallel transfers of $p$ blocks are allowed, speedup linear in $p$ is obtained.} } @InProceedings{agrawal:asynch, author = {Gagan Agrawal and Anurag Acharya and Joel Saltz}, title = {An Interprocedural Framework for Placement of Asynchronous {I/O} Operations}, booktitle = {Proceedings of the 10th ACM International Conference on Supercomputing}, year = {1996}, month = {May}, pages = {358--365}, publisher = {ACM Press}, address = {Philadelphia, PA}, keyword = {compiler, I/O, pario-bib}, comment = {Not really about parallel applications or parallel I/O, but I think it may be of interest to that community. They propose a compiler framework for a compiler to insert asynchronous I/O operations (start I/O, finish I/O), to satisfy the dependency constraints of the program.} } @InProceedings{alvarez:failures, author = {Guillermo A. Alvarez and Walter A. Burkhard and Flaviu Cristian}, title = {Tolerating Multiple Failures in {RAID} Architectures with Optimal Storage and Uniform Declustering}, booktitle = {Proceedings of the 24th Annual International Symposium on Computer Architecture}, year = {1997}, month = {May}, pages = {62}, publisher = {IEEE Computer Society Press}, URL = {ftp://cs.ucsd.edu/pub/galvarez/papers/datumisca.ps.Z}, keyword = {fault tolerance, RAID, disk array, parallel I/O, pario-bib} } @InProceedings{alverson:tera, author = {Robert Alverson and David Callahan and Daniel Cummings and Brian Koblenz and Allan Porterfield and Burton Smith}, title = {The {Tera} Computer System}, booktitle = {Proceedings of the 1990 ACM International Conference on Supercomputing}, year = {1990}, pages = {1--6}, keyword = {parallel architecture, MIMD, NUMA, pario-bib}, comment = {Interesting architecture. 3-d mesh of pipelined packet-switch nodes, e.g., 16x16x16 is 4096 nodes, with 256 procs, 512 memory units, 256 I/O cache units, and 256 I/O processors attached. 2816 remaining nodes are just switching nodes. Each processor is 64-bit custom chip with up to 128 simultaneous threads in execution. It alternates between ready threads, with a deep pipeline. Inter-instruction dependencies explicitly encoded by the compiler, stalling those threads until the appropriate time. Each thread has a complete set of registers! Memory units have 4-bit tags on each word, for full/empty and trap bits. Shared memory across the network: ``The Tera ISP-level architecture is UMA, even though the PMS-level architecture is NUMA. Put another way, the memory looks a single cycle away to the compiler writer.'' -- Burton Smith. See also tera:brochure.} } @InProceedings{ap:enwrich, author = {Apratim Purakayastha and Carla Schlatter Ellis and David Kotz}, title = {{ENWRICH:} A Compute-Processor Write Caching Scheme for Parallel File Systems}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {55--68}, publisher = {ACM Press}, address = {Philadelphia}, earlier = {ap:enwrich-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/ap:enwrich.ps.Z}, keyword = {parallel file system, parallel I/O, caching, pario-bib, dfk}, abstract = {Many parallel scientific applications need high-performance I/O. Unfortunately, end-to-end parallel-I/O performance has not been able to keep up with substantial improvements in parallel-I/O hardware because of poor parallel file-system software. Many radical changes, both at the interface level and the implementation level, have recently been proposed. One such proposed interface is {\em collective I/O}, which allows parallel jobs to request transfer of large contiguous objects in a single request, thereby preserving useful semantic information that would otherwise be lost if the transfer were expressed as per-processor non-contiguous requests. Kotz has proposed {\em disk-directed I/O} as an efficient implementation technique for collective-I/O operations, where the compute processors make a single collective data-transfer request, and the I/O processors thereafter take full control of the actual data transfer, exploiting their detailed knowledge of the disk-layout to attain substantially improved performance. \par Recent parallel file-system usage studies show that writes to write-only files are a dominant part of the workload. Therefore, optimizing writes could have a significant impact on overall performance. In this paper, we propose ENWRICH, a compute-processor write-caching scheme for write-only files in parallel file systems. ENWRICH combines low-overhead write caching at the compute processors with high performance disk-directed I/O at the I/O processors to achieve both low latency and high bandwidth. This combination facilitates the use of the powerful disk-directed I/O technique independent of any particular choice of interface. By collecting writes over many files and applications, ENWRICH lets the I/O processors optimize disk I/O over a large pool of requests. We evaluate our design via simulated implementation and show that ENWRICH achieves high performance for various configurations and workloads.} } @TechReport{ap:enwrich-tr, author = {Apratim Purakayastha and Carla Schlatter Ellis and David Kotz}, title = {{ENWRICH:} A Compute-Processor Write Caching Scheme for Parallel File Systems}, year = {1995}, month = {October}, number = {CS-1995-22}, institution = {Dept. of Computer Science, Duke University}, later = {ap:enwrich}, URL = {ftp://ftp.cs.duke.edu/dist/techreport/1995/1995-22.ps.gz}, keyword = {parallel file system, parallel I/O, caching, pario-bib, dfk}, abstract = {Many parallel scientific applications need high-performance I/O. Unfortunately, end-to-end parallel-I/O performance has not been able to keep up with substantial improvements in parallel-I/O hardware because of poor parallel file-system software. Many radical changes, both at the interface level and the implementation level, have recently been proposed. One such proposed interface is {\em collective I/O}, which allows parallel jobs to request transfer of large contiguous objects in a single request, thereby preserving useful semantic information that would otherwise be lost if the transfer were expressed as per-processor non-contiguous requests. Kotz has proposed {\em disk-directed I/O} as an efficient implementation technique for collective-I/O operations, where the compute processors make a single collective data-transfer request, and the I/O processors thereafter take full control of the actual data transfer, exploiting their detailed knowledge of the disk-layout to attain substantially improved performance. \par Recent parallel file-system usage studies show that writes to write-only files are a dominant part of the workload. Therefore, optimizing writes could have a significant impact on overall performance. In this paper, we propose ENWRICH, a compute-processor write-caching scheme for write-only files in parallel file systems. ENWRICH combines low-overhead write caching at the compute processors with high performance disk-directed I/O at the I/O processors to achieve both low latency and high bandwidth. This combination facilitates the use of the powerful disk-directed I/O technique independent of any particular choice of interface. By collecting writes over many files and applications, ENWRICH lets the I/O processors optimize disk I/O over a large pool of requests. We evaluate our design via simulated implementation and show that ENWRICH achieves high performance for various configurations and workloads.} } @PhdThesis{ap:thesis, author = {Apratim Purakayastha}, title = {Characterizing and Optimizing Parallel File Systems}, year = {1996}, month = {June}, school = {Dept. of Computer Science, Duke University}, address = {Durham, NC}, note = {Also available as technical report CS-1996-10}, URL = {ftp://ftp.cs.duke.edu/dist/techreport/1996/1996-10.ps.gz}, keyword = {parallel I/O, multiprocessor file system, file access patterns, workload characterization, file caching, disk-directed I/O, pario-bib}, abstract = {High-performance parallel file systems are needed to satisfy tremendous I/O requirements of parallel scientific applications. The design of such parallel file systems depends on a comprehensive understanding of the expected workload, but so far there have been very few usage studies of multiprocessor file systems. In the first part of this dissertation, we attempt to fill this void by measuring a real file-system workload on a production parallel machine, namely the CM-5 at the National Center for Supercomputing Applications. We collect information about nearly every individual I/O request from the mix of jobs running on the machine. Analysis of the traces leads to various recommendations for design of future parallel file systems. Our usage study showed that writes to write-only files are a dominant part of the workload. Therefore, optimizing writes could have a significant impact on overall performance. In the second part of this dissertation, we propose ENWRICH, a compute-processor write-caching scheme for write-only files in parallel file systems. Within its framework, ENWRICH uses a recently proposed high performance implementation of collective I/O operations called disk-directed I/O, but it eliminates a number of limitations of disk-directed I/O. ENWRICH combines low-overhead write caching at the compute processors with high performance disk-directed I/O at the I/O processors to achieve both low latency and high bandwidth. This combination facilitates the use of the powerful disk-directed I/O technique independent of any particular choice of interface, and without the requirement for mapping libraries at the I/O processors. By collecting writes over many files and applications, ENWRICH lets the I/O processors optimize disk I/O over a large pool of requests. We evaluate our design of ENWRICH using simulated implementation and extensive experimentation. We show that ENWRICH achieves high performance for various configurations and workloads. We pinpoint the reasons for ENWRICH`s failure to perform well for certain workloads, and suggest possible enhancements. Finally, we discuss the nuances of implementing ENWRICH on a real platform and speculate about possible adaptations of ENWRICH for emerging multiprocessing platforms.}, comment = {See also ap:enwrich, ap:workload, and nieuwejaar:workload} } @InProceedings{ap:workload, author = {Apratim Purakayastha and Carla Schlatter Ellis and David Kotz and Nils Nieuwejaar and Michael Best}, title = {Characterizing Parallel File-Access Patterns on a Large-Scale Multiprocessor}, booktitle = {Proceedings of the Ninth International Parallel Processing Symposium}, year = {1995}, month = {April}, pages = {165--172}, publisher = {IEEE Computer Society Press}, earlier = {ap:workload-tr}, later = {nieuwejaar:workload-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/ap:workload.ps.Z}, keyword = {parallel I/O, file access pattern, multiprocessor file system, file system workload, dfk, pario-bib}, abstract = {High-performance parallel file systems are needed to satisfy tremendous I/O requirements of parallel scientific applications. The design of such high-performance parallel file systems depends on a comprehensive understanding of the expected workload, but so far there have been very few usage studies of multiprocessor file systems. This paper is part of the CHARISMA project, which intends to fill this void by measuring real file-system workloads on various production parallel machines. In particular, here we present results from the CM-5 at the National Center for Supercomputing Applications. Our results are unique because we collect information about nearly every individual I/O request from the mix of jobs running on the machine. Analysis of the traces leads to various recommendations for parallel file-system design.}, comment = {See also kotz:workload, nieuwejaar:strided.} } @TechReport{ap:workload-tr, author = {Apratim Purakayastha and Carla Schlatter Ellis and David Kotz and Nils Nieuwejaar and Michael Best}, title = {Characterizing Parallel File-Access Patterns on a Large-Scale Multiprocessor}, year = {1994}, month = {October}, number = {CS-1994-33}, institution = {Dept. of Computer Science, Duke University}, later = {ap:workload}, URL = {ftp://ftp.cs.duke.edu/pub/dist/techreport/1994/1994-33.ps.Z}, keyword = {parallel I/O, file access pattern, multiprocessor file system, file system workload, dfk, pario-bib}, abstract = {Rapid increases in the computational speeds of multiprocessors have not been matched by corresponding performance enhancements in the I/O subsystem. To satisfy the large and growing I/O requirements of some parallel scientific applications, we need parallel file systems that can provide high-bandwidth and high-volume data transfer between the I/O subsystem and thousands of processors. \par Design of such high-performance parallel file systems depends on a thorough grasp of the expected workload. So far there have been no comprehensive usage studies of multiprocessor file systems. Our CHARISMA project intends to fill this void. The first results from our study involve an iPSC/860 at NASA Ames. This paper presents results from a different platform, the CM-5 at the National Center for Supercomputing Applications. The CHARISMA studies are unique because we collect information about every individual read and write request and about the entire mix of applications running on the machines. \par The results of our trace analysis lead to recommendations for parallel file system design. First, the file system should support efficient concurrent access to many files, and I/O requests from many jobs under varying load condit ions. Second, it must efficiently manage large files kept open for long periods. Third, it should expect to see small requests, predominantly sequential access patterns, application-wide synchronous access, no concurrent file-sharing between jobs, appreciable byte and block sharing between processes within jobs, and strong interprocess locality. Finally, the trace data suggest that node-level write caches and collective I/O request interfaces may be useful in certain environments.}, comment = {See also kotz:workload, nieuwejaar:strided.} } @TechReport{arendt:genome, author = {James W. Arendt}, title = {Parallel Genome Sequence Comparison Using a Concurrent File System}, year = {1991}, number = {UIUCDCS-R-91-1674}, institution = {University of Illinois at Urbana-Champaign}, keyword = {parallel file system, parallel I/O, Intel iPSC/2, pario-bib}, comment = {Studies the performance of Intel CFS. Uses an application that reads in a huge file of records, each a genome sequence, and compares each sequence against a given sequence. Looks at cache performance, message latency, cost of prefetches and directory reads, and throughput. He tries one-disk, one-proc transfer rates. Because of contention with the directory server on one of the two I/O nodes, it was faster to put all of the file on the other I/O node. Striping is good for multiple readers. Best access pattern was interleaved, not segmented or separate files, because it avoided disk seeks. Perhaps the files are stored contiguously? Can get good speedup by reading the sequences in big integral record sizes, from CFS, using a load-balancing scheduled by the host. Contention for directory blocks -- through single-node directory server.} } @InCollection{arge:GIS, author = {Lars Arge}, title = {External-memory algorithms with applications in {GIS}}, booktitle = {Algorithmic foundations of geographic information systems}, editor = {Marc van Kreveld and Jurg Nievergelt and Thomas Roos and Peter Widmayer}, year = {1997}, series = {Lecture Notes in Computer Science}, volume = {1340}, pages = {213--254}, publisher = {Springer-Verlag}, URL = {http://www.cs.duke.edu/~large/Papers/gisnotes.ps}, keyword = {out-of-core algorithm, geographic information system, GIS, pario-bib}, abstract = {The paper presents a survey of the basic paradigms for designing efficient external-memory algorithms and especially for designing external-memory algorithms for computational geometry problems with applications in GIS. As the area of external-memory algorithms is relatively young the paper focuses on fundamental external-memory design techniques more than on algorithms for specific GIS problems. The presentation is survey-like with a more detailed discussion of the most important techniques and algorithms.}, comment = {not parallel? but mentions some parallel disk stuff.} } @Article{arge:jsegments, author = {Lars Arge and Darren Erik Vengroff and Jeffrey Scott Vitter}, title = {External-Memory Algorithms for Processing Line Segments in Geographic Information Systems}, journal = {Algorithmica}, year = {1998}, note = {To appear}, earlier = {arge:segments}, URL = {ftp://cs.duke.edu/pub/jsv/Papers/AVV97.SegmentGIS.ps.gz}, keyword = {verify, out-of-core algorithm, computational geometry, pario-bib}, abstract = {We present a set of algorithms designed to solve large-scale geometric problems involving collections of line segments in the plane. Geographical information systems (GIS) handle large amounts of spatial data, and at some level the data is often manipulated as collections of line segments. NASA's EOS project is an example of a GIS that is expected to store and manipulate petabytes (thousands of terabytes, or millions of gigabytes) of data! In the design of algorithms for this type of large-scale application, it is essential to consider the problem of minimizing I/O communication, which is the bottleneck. \par In this paper we develop efficient new external-memory algorithms for a number of important problems involving line segments in the plane, including trapezoid decomposition, batched planar point location, triangulation, red-blue line segment intersection reporting, and general line segment intersection reporting. In GIS systems, the first three problems are useful for rendering and modeling, and the latter two are frequently used for overlaying maps and extracting information from them. To solve these problems, we combine and modify in novel ways several of the previously known techniques for designing efficient algorithms for external memory. We also develop a powerful new technique that can be regarded as a practical external memory version of fractional cascading. Except for the batched planar point location problem, no algorithms specifically designed for external memory were previously known for these problems. Our algorithms for triangulation and line segment intersection partially answer previously posed open problems, while the batched planar point location algorithm improves on the previously known solution, which applied only to monotone decompositions. Our algorithm for the red-blue line segment intersection problem is provably optimal.}, comment = {Special issue on cartography and geographic information systems.} } @InCollection{arge:lower, author = {Lars Arge and Peter Bro Miltersen}, title = {On showing lower bounds for external-memory computational geometry problems}, booktitle = {External Memory Algorithms and Visualization}, editor = {James Abello and Jeffrey Scott Vitter}, crossref = {abello:dimacs}, year = {1999}, series = {DIMACS Series in Discrete Mathematics and Theoretical Computer Science}, publisher = {American Mathematical Society Press}, address = {Providence, RI}, keyword = {verify publication authors and pages, out-of-core algorithm, computational geometry, pario-bib}, comment = {See also the component papers vitter:survey, arge:lower, crauser:segment, grossi:crosstrees, toledo:survey. Not clear to what extent these papers are about *parallel* I/O.} } @InProceedings{arge:segments, author = {Lars Arge and Darren Erik Vengroff and Jeffrey Scott Vitter}, title = {External-Memory Algorithms for Processing Line Segments in Geographic Information Systems}, booktitle = {Proceedings of the Third European Symposium on Algorithms}, year = {1995}, month = {September}, series = {Lecture Notes in Computer Science}, volume = {979}, pages = {295--310}, publisher = {Springer-Verlag}, address = {Corfu, Greece}, later = {arge:jsegments}, URL = {ftp://cs.duke.edu/pub/jsv/Papers/AVV95.SegmentGIS.ps.Z}, keyword = {out-of-core algorithm, computational geometry, pario-bib}, abstract = {In the design of algorithms for large-scale applications it is essential to consider the problem of minimizing I/O communication. Geographical information systems (GIS) are good examples of such large-scale applications as they frequently handle huge amounts of spatial data. In this paper we develop efficient new external-memory algorithms for a number of important problems involving line segments in the plane, including trapezoid decomposition, batched planar point location, triangulation, red-blue line segment intersection reporting, and general line segment intersection reporting. In GIS systems, the first three problems are useful for rendering and modeling, and the latter two are frequently used for overlaying maps and extracting information from them.}, comment = {Does deal with parallel disks, though not in great detail.} } @InProceedings{arge:sorting, author = {Lars Arge and Paolo Ferragina and Roberto Grossi and Jeffrey Scott Vitter}, title = {Sequence sorting in secondary storage}, booktitle = {Proceedings of Compression and Complexity of Sequences}, year = {1998}, month = {June}, pages = {329--346}, publisher = {IEEE Computer Society Press}, address = {Salerno, Italy}, keyword = {out-of-core algorithm, sorting algorithm, pario-bib}, abstract = {We investigate the I/O complexity of the problem of sorting sequences (or strings of characters) in external memory, which is a fundamental component of many large-scale text applications. In the standard unit-cost RAM comparison model, the complexity of sorting K strings of total length N is Theta (K log/sub 2/ K+N). By analogy, in the external memory (or I/O) model, where the internal memory has size M and the block transfer size is B, it would be natural to guess that the I/O complexity of sorting sequences is Theta ((K/B)log/sub M/B/(K/B)+(N/B)), but the known algorithms do not come even close to achieving this bound. Our results show, somewhat counterintuitively, that the I/O complexity of string sorting depends upon the length of the strings relative to the block size. We first consider a simple comparison I/O model, where the strings are not allowed to be broken into their individual characters, and we show that the I/O complexity of string sorting in this model is Theta ((N/sub 1//B)log/sub M/B/(N/sub 1//B)+K/sub 2/+(N/B)), where N/sub 1/ is the total length of all strings shorter than B and K/sub 2/ is the number of strings longer than B. We then consider two more general I/O comparison models in which string breaking is allowed. We obtain improved algorithms and in several cases lower bounds that match their I/O bounds. Finally, we develop more practical algorithms outside the comparison model.}, comment = {This paper is really the same paper as arge:sorting-strings.} } @InProceedings{arge:sorting-strings, author = {Lars Arge and Paolo Ferragina and Roberto Grossi and Jeffrey Scott Vitter}, title = {On sorting strings in external memory}, booktitle = {Proceedings of the 29th Annual ACM Symposium on Theory of Computing}, year = {1997}, month = {May}, pages = {540--548}, publisher = {ACM Press}, address = {El Paso}, URL = {file://ftp.cs.duke.edu/pub/jsv/Papers/AFG97.stringsorting.ps.gz}, keyword = {out-of-core algorithm, sorting, parallel I/O, pario-bib}, abstract = {In this paper we address for the first time the I/O complexity of the problem of sorting strings in external memory, which is a fundamental component of many large-scale text applications. In the standard unit-cost RAM comparison model, the complexity of sorting K strings of total length N is theta(K log K + N). By analogy, in the external memory (or I/O) model, where the internal memory has size M and the block transfer size is B, it would be natural to guess that the I/O complexity of sorting strings is theta(K/B log_(M/B) (K/B) + N/B), but the known algorithms do not come even close to achieving this bound. Our results show, somewhat counterintuitively, that the I/O complexity of string sorting depends upon the length of the strings relative to the block size. We first consider a simple comparison I/O model, where one is not allowed to break the strings into their characters, and we show that the I/O complexity of string sorting in this model is theta(N_1/B log_(M/B) (N_1/B) + K_2 log_(M/B) K_2 + N/B), where N_1 is the total length of all strings shorter than B and K_2 is the number of strings longer than B. We then consider two more general I/O comparison models in which string breaking is allowed. We obtain improved algorithms and in several cases lower bounds that match their I/O bounds. Finally, we develop more practical algorithms without assuming the comparison model.}, comment = {Not parallel? But mentions some parallel disk stuff.} } @InProceedings{armen:disk-model, author = {Chris Armen}, title = {Bounds on the Separation of Two Parallel Disk Models}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {122--127}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, theory, parallel I/O algorithm, pario-bib}, abstract = {The single-disk, D-head model of parallel I/O was introduced by Agarwal and Vitter to analyze algorithms for problem instances that are too large to fit in primary memory. Subsequently Vitter and Shriver proposed a more realistic model in which the disk space is partitioned into D disks, with a single head per disk. To date, each problem for which there is a known optimal algorithm for both models has the same asymptotic bounds on both models. Therefore, it has been unknown whether the models are equivalent or whether the single-disk model is strictly more powerful. \par In this paper we provide evidence that the single-disk model is strictly more powerful. We prove a lower bound on any general simulation of the single-disk model on the multi-disk model and establish randomized and deterministic upper bounds. Let $N$ be the problem size and let $T$ be the number of parallel I/Os required by a program on the single-disk model. Then any simulation of this program on the multi-disk model will require $\Omega\left(T \frac{\log(N/D)}{\log \log(N/D)}\right)$ parallel I/Os. This lower bound holds even if replication is allowed in the multi-disk model. We also show an $O\left(\frac{\log D}{\log \log D}\right)$ randomized upper bound and an $O\left(\log D (\log \log D)^2\right)$ deterministic upper bound. These results exploit an interesting analogy between the disk models and the PRAM and DCM models of parallel computation.} } @InProceedings{arpaci-dusseau:river, author = {Remzi H. Arpaci-Dusseau and Eric Anderson and Noah Treuhaft and David E. Culler and Joseph M. Hellerstein and David Patterson and Kathy Yelick}, title = {Cluster {I/O} with {River}: Making the Fast Case Common}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {10--22}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Remzi.ps}, keyword = {cluster computing, parallel I/O, pario-bib}, abstract = {We introduce River, a data-flow programming environment and I/O substrate for clusters of computers. River is designed to provide maximum performance in the common case--- even in the face of non-uniformities in hardware, software, and workload. River is based on two simple design features: a high-performance distributed queue,and a storage redundancy mechanism called graduated declustering.We have implemented a number of data-intensive applications on River, which validate our design with near-ideal performance in a variety of non-uniform performance scenarios.} } @InProceedings{arunachalam:prefetch, author = {Meenakshi Arunachalam and Alok Choudhary and Brad Rullman}, title = {A Prefetching Prototype for the Parallel File System on the {Paragon}}, booktitle = {Proceedings of the 1995 ACM SIGMETRICS Conference on Measurement and Modeling of Computer Systems}, year = {1995}, month = {May}, pages = {321--323}, note = {Extended Abstract}, keyword = {parallel I/O, prefetching, parallel file system, pario-bib}, comment = {A related paper is arunachalam:prefetch2.} } @InProceedings{arunachalam:prefetch2, author = {Meenkashi Arunachalam and Alok Choudhary and Brad Rullman}, title = {Implementation and evaluation of prefetching in the {Intel Paragon Parallel File System}}, booktitle = {Proceedings of the Tenth International Parallel Processing Symposium}, year = {1996}, month = {April}, pages = {554--559}, URL = {http://www.ece.nwu.edu/~meena/papers/ipps.ps}, keyword = {parallel I/O, prefetching, multiprocessor file system, pario-bib}, abstract = {The significant difference between the speeds of the I/O system (e.g., disks) and compute processors in parallel systems creates a bottleneck that lowers the performance of an application that does a considerable amount of disk accesses. A major portion of the compute processors' time is wasted on waiting for I/O to complete. This problem can be addressed to a certain extent, if the necessary data can be fetched from the disk before the I/O call to the disk is issued. Fetching data ahead of time, known as prefetching in a multiprocessor environment depends a great deal on the application's access pattern. The subject of this paper is implementation and performance evaluation of a prefetching prototype in a production parallel file system on the Intel Paragon. Specifically, this paper presents a) design and implementation of a prefetching strategy in the parallel file system and b) performance measurements and evaluation of the file system with and without prefetching. The prototype is designed at the operating system level for the PFS. It is implemented in the PFS subsystem of the Intel Paragon Operating System. It is observed that in many cases prefetching provides considerable performance improvements. In some other cases no improvements or some performance degradation is observed due to the overheads incurred in prefetching.}, comment = {See arunachalam:prefetch.} } @InProceedings{asbury:fortranio, author = {Raymond K. Asbury and David S. Scott}, title = {{FORTRAN} {I/O} on the {iPSC/2}: Is there read after write?}, booktitle = {Proceedings of the Fourth Conference on Hypercube Concurrent Computers and Applications}, year = {1989}, pages = {129--132}, publisher = {Golden Gate Enterprises, Los Altos, CA}, address = {Monterey, CA}, keyword = {parallel I/O, hypercube, Intel iPSC/2, file access pattern, pario-bib} } @InProceedings{asthana:active, author = {Abhaya Asthana and Mark Cravatts and Paul Krzyzanowski}, title = {An Experimental Active Memory Based {I/O} Subsystem}, booktitle = {Proceedings of the IPPS~'94 Workshop on Input/Output in Parallel Computer Systems}, year = {1994}, month = {April}, pages = {73--84}, organization = {AT\&T Bell Labs}, note = {Also appeared in Computer Architecture News 22(4)}, later = {asthana:active-book}, keyword = {parallel I/O, architecture, pario-bib}, comment = {They describe an I/O subsystem based on an ``active memory'' called SWIM (Structured Wafer-based Intelligent Memory). SWIM chips are RAM chips with some built-in processing. The idea is that these tiny processors can manipulate the data in the chip at full speed, without dealing with memory bus or off-chip costs. Further, the chips can work in parallel. They demonstrate how they've used this to build a national phone database server, a high-performance IP router, and a call-screening agent.} } @InCollection{asthana:active-book, author = {Abhaya Asthana and Mark Cravatts and Paul Krzyzanowski}, title = {An Experimental Memory-based {I/O} Subsystem}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {17}, editor = {Ravi Jain and John Werth and James C. Browne}, crossref = {iopads-book}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {373--390}, publisher = {Kluwer Academic Publishers}, earlier = {asthana:active}, keyword = {parallel I/O architecture, pario-bib}, abstract = {We describe an I/O subsystem based on an active memory named SWIM (Structured Wafer-based Intelligent Memory) designed for efficient storage and manipulation of data structures. The key architectural idea in SWIM is to associate some processing logic with each memory chip that allows it to perform data manipulation operations locally and to communicate with a disk or a communication line through a backend port. The processing logic is specially designed to perform operations such as pointer dereferencing, memory indirection, searching and bounds checking efficiently. The I/O subsystem is built using an interconnected ensemble of such memory logic pairs. A complex processing task can now be distributed between a large number of small memory processors each doing a sub-task, while still retaining a common locus of control in the host CPU for higher level administrative and provisioning functions. We argue that active memory based processing enables more powerful, scalable and robust designs for storage and communications subsystems, that can support emerging network services, multimedia workstations and wireless PCS systems. A complete parallel hardware and software system constructed using an array of SWIM elements has been operational for over a year. We present results from application of SWIM to three network functions: a national phone database server, a high performance IP router, and a call screening agent.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @InProceedings{avalani:channels, author = {Bhavan Avalani and Alok Choudhary and Ian Foster and Rakesh Kirshnaiyer}, title = {Integrating Task and Data Parallelism Using Parallel {I/O} Techniques}, booktitle = {Proceedings of the International Workshop on Parallel Processing}, year = {1994}, month = {December}, address = {Bangalore, India}, URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/task_data.ps.Z}, keyword = {parallel I/O, pario-bib}, comment = {They describe using the techniques of delrosario and debenedictis (although without mentioning them) to provide for channels (parallel pipes) between independent data-parallel tasks. The technique really is the same as in debenedictus and delrosario, although they extend it a bit to allow multiple "files" within a channel (why not use multiple channels)? Also, they depend on the program to read and write synchronization variables to control access to the flow of data through the channel. While this may provide good performance in some cases, why not have support for automatic flow control? The system can detect when a portion of the channel is written, and release readers waiting on that portion of the channel (if any). The paper is a bit confusing in its use of the word "file", which seems to be used to mean different things at different points. Also, they seem to use an arbitrary distribution for the "file", which may or may not be the same as one of those used by the two endpoints.} } @TechReport{bagrodia:sio-character, author = {Rajive Bagrodia and Andrew Chien and Yarson Hsu and Daniel Reed}, title = {Input/Output: Instrumentation, Characterization, Modeling and Management Policy}, year = {1994}, number = {CCSF-41}, institution = {Scalable I/O Initiative}, address = {Caltech Concurrent Supercomputing Facilities, Caltech}, URL = {http://www.ccsf.caltech.edu/SIO/SIO_perf.ps}, keyword = {parallel I/O, pario-bib, prefetching, caching, multiprocessor file system, file access pattern}, comment = {Basically there are two parts to this paper. First, they will instrument applications, Intel PFS, and IBM Vesta, to trace I/O-related activity. Then they'll use Pablo to analyze and characterize. They plan to trace some events in detail, and the rest with histogram counters. Second, they plan to develop caching and prefetching policies and to analyze those with simulation, analysis, and implementation. They note that IBM and Intel are developing parallel I/O architecture simulators. See also poole:sio-survey, choudhary:sio-language, bershad:sio-os.} } @InProceedings{baird:disa, author = {R. Baird and S. Karamooz and H. Vazire}, title = {Distributed Information Storage Architecture}, booktitle = {Proceedings of the Twelfth IEEE Symposium on Mass Storage Systems}, year = {1993}, pages = {145--155}, keyword = {parallel I/O, distributed file system, mass storage, pario-bib}, comment = {Architecture for distributed information storage. Integrates file systems, databases, etc. Single system image, lots of support for administration. O-O model, with storage device objects, logical device objects, volume objects, and file objects. Methods for each type of object, including administrative methods.} } @InProceedings{baldwin:hyperfs, author = {C. H. Baldwin and W. C. Nestlerode}, title = {A Large Scale File Processing Application on a Hypercube}, booktitle = {Proceedings of the Fifth Annual Distributed-Memory Computer Conference}, year = {1990}, pages = {1400-1404}, keyword = {multiprocessor file system, file access pattern, parallel I/O, hypercube, pario-bib}, comment = {Census-data processing on an nCUBE/10 at USC. Their program uses an interleaved pattern, which is like my lfp or gw with multi-record records (i.e., the application does its own blocking). Shifted to asynchronous I/O to do OBL manually. Better results if they did more computation per I/O (of course).} } @TechReport{baptist:fft, author = {Lauren M. Baptist}, title = {Two Algorithms for Performing Multidimensional, Multiprocessor, Out-of-Core {FFTs}}, year = {1999}, month = {June}, number = {PCS-TR99-350}, institution = {Dept. of Computer Science, Dartmouth College}, address = {Hanover, NH}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR99-350.ps.Z}, keyword = {parallel I/O, out of core, FFT, parallel algorithm, scientific computing, pario-bib}, abstract = {We show two algorithms for computing multidimensional Fast Fourier Transforms (FFTs) on a multiprocessor system with distributed memory when problem sizes are so large that the data do not fit in the memory of the entire system. Instead, data reside on a parallel disk system and are brought into memory in sections. We use the Parallel Disk Model for implementation and analysis. \par The first method is a straightforward out-of-core variant of a well-known method for in-core, multidimensional FFTs. It performs 1-dimensional FFT computations on each dimension in turn. This method is easy to generalize to any number of dimensions, and it also readily permits the individual dimensions to be of any sizes that are integer powers of~2. The key step is an out-of-core transpose operation that places the data along each dimension into contiguous positions on the parallel disk system so that the data for the 1-dimensional FFTs are contiguous.\par The second method is an adaptation of another well-known method for in-core, multidimensional FFTs. This method computes all dimensions simultaneously. It is more difficult to generalize to arbitrary radices and number of dimensions in this method than in the first method. Our present implementation is therefore limited to two dimensions of equal size, that are again integer powers of~2. \par We present I/O complexity analyses for both methods as well as empirical results for a DEC~2100 server and an SGI Origin~2000, each of which has a parallel disk system. Our results indicate that the methods are comparable in speed in two-dimensions.}, comment = {Undergraduate Honors Thesis. Advisor: Tom Cormen.} } @TechReport{barak:hfs, author = {Amnon Barak and Bernard A. Galler and Yaron Farber}, title = {A Holographic File System for a Multicomputer with Many Disk Nodes}, year = {1988}, month = {May}, number = {88-6}, institution = {Dept. of Computer Science, Hebrew University of Jerusalem}, keyword = {parallel I/O, hashing, reliability, disk mirroring, pario-bib}, comment = {Describes a file system for a distributed system that scatters records of each file over many disks using hash functions. The hash function is known by all processors, so no one processor must be up to access the file. Any portion of the file whose disknode is available may be accessed. Shadow nodes are used to take over for nodes that go down, saving the info for later use by the proper node. Intended to easily parallelize read/write accesses and global file operations, and to increase file availability.} } @InProceedings{barve:competitive2, author = {Rakesh Barve and Mahesh Kallahalla and Peter J. Varman and Jeffrey Scott Vitter}, title = {Competitive Parallel Disk Prefetching and Buffer Management}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {47--56}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {disk prefetching, file caching, parallel I/O, pario-bib}, abstract = {We provide a competitive analysis framework for online prefetching and buffer management algorithms in parallel I/O systems, using a read-once model of block references. This has widespread applicability to key I/O-bound applications such as external merging and concurrent playback of multiple video streams. Two realistic lookahead models, global lookahead and local lookahead, are defined. Algorithms NOM and GREED based on these two forms of lookahead are analyzed for shared buffer and distributed buffer configurations, both of which occur frequently in existing systems. An important aspect of our work is that we show how to implement both the models of lookahead in practice using the simple techniques of forecasting and flushing. \par Given a D-disk parallel I/O system and a globally shared I/O buffer that can hold upto M disk blocks, we derive a lower bound of $\Omega(\sqrt{D}$) on the competitive ratio of any deterministic online prefetching algorithm with O(M) lookahead. NOM is shown to match the lower bound using global M-block lookahead. In contrast, using only local lookahead results in an $\Omega(D)$ competitive ratio. When the buffer is distributed into D portions of M/D blocks each, the algorithm GREED based on local lookahead is shown to be optimal, and NOM is within a constant factor of optimal. Thus we provide a theoretical basis for the intuition that global lookahead is more valuable for prefetching in the case of a shared buffer configuration whereas it is enough to provide local lookahead in case of the distributed configuration. Finally, we analyze the performance of these algorithms for reference strings generated by a uniformly-random stochastic process and we show that they achieve the minimal expected number of I/Os. These results also give bounds on the worst-case expected performance of algorithms which employ randomization in the data layout.}, comment = {See also barve:competitive. They propose two methods for scheduling prefetch operations in the situation where the access pattern is largely known in advance, in such a way as to minimize the total number of parallel I/Os. The two methods are quite straightforward, and yet match the optimum lower bound for an on-line algorithm.} } @Article{barve:jmergesort, author = {Rakesh D. Barve and Edward F. Grove and Jeffrey S. Vitter}, title = {Simple Randomized Mergesort on Parallel Disks}, journal = {Parallel Computing}, year = {1997}, month = {June}, volume = {23}, number = {4}, pages = {601--631}, publisher = {North-Holland (Elsevier Scientific)}, earlier = {barve:mergesort}, URL = {file://cs.duke.edu/pub/jsv/Papers/BGV96.Simple_Mergesort.ps.gz}, keyword = {parallel I/O algorithm, sorting, pario-bib}, abstract = {We consider the problem of sorting a file of N records on the D-disk model of parallel I/O in which there are two sources of parallelism. Records are transferred to and from disk concurrently in blocks of B contiguous records. In each I/O operation, up to one block can be transferred to or from each of the D disks in parallel. We propose a simple, efficient, randomized mergesort algorithm called SRM that uses a forecast-and-flush approach to overcome the inherent difficulties of simple merging on parallel disks. SRM exhibits a limited use of randomization and also has a useful deterministic version. Generalizing the technique of forecasting, our algorithm is able to read in, at any time, the ``right'' block from any disk, and using the technique of flushing, our algorithm evicts, without any I/O overhead, just the ``right'' blocks from memory to make space for new ones to be read in. The disk layout of SRM is such that it enjoys perfect write parallelism, avoiding fundamental inefficiencies of previous mergesort algorithms. By analysis of generalized maximum occupancy problems we are able to derive an analytical upper bound on SRM's expected overhead valid for arbitrary inputs. \par The upper bound derived on expected I/O performance of SRM indicates that SRM is provably better than disk-striped mergesort (DSM) for realistic parameter values D, M, and B. Average-case simulations show further improvement on the analytical upper bound. Unlike previously proposed optimal sorting algorithms, SRM outperforms DSM even when the number D of parallel disks is small.}, comment = {This paper formerly called barve:mergesort; I discovered that the paper had appeared in SPAA96, so the SPAA96 paper is now called barve:mergesort.} } @InProceedings{barve:mergesort, author = {Rakesh D. Barve and Edward F. Grove and Jeffrey S. Vitter}, title = {Simple Randomized Mergesort on Parallel Disks}, booktitle = {Proceedings of the Eighth Symposium on Parallel Algorithms and Architectures}, year = {1996}, month = {June}, pages = {109--118}, publisher = {ACM Press}, address = {Padua, Italy}, later = {barve:jmergesort}, keyword = {parallel I/O algorithm, sorting, pario-bib} } @InProceedings{barve:round, author = {Rakesh Barve and Phillip B. Gibbons and Bruce K. Hillyer and Yossi Matias and Elizabeth Shriver and Jeffrey Scott Vitter}, title = {Round-like Behavior in Multiple Disks on a Bus}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {1--9}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Shriver.ps}, keyword = {disk, I/O bus, parallel I/O, pario-bib}, abstract = {In modern I/O architectures, multiple disk drives are attached to each I/O bus. Under I/O-intensive workloads, the disk latency for a request can be overlapped with the disk latency and data transfers of requests to other disks, potentially resulting in an aggregate I/O throughput at nearly bus bandwidth. This paper reports on a performance impairment that results from a previously unknown form of convoy behavior in disk I/O, which we call rounds. In rounds, independent requests to distinct disks convoy, so that each disk services one request before any disk services its next request. We analyze log files to describe read performance of multiple Seagate Wren-7 disks that share a SCSI bus under a heavy workload, demonstrating the rounds behavior and quantifying its performance impact.} } @Article{batcher:staran, author = {K. E. Batcher}, title = {{STARAN} Parallel Processor System Hardware}, journal = {AFIPS Conference Proceedings}, year = {1974}, pages = {405--410}, keyword = {parallel architecture, array processor, parallel I/O, SIMD, pario-bib}, comment = {This paper is reproduced in Kuhn and Padua's (1981, IEEE) survey ``Tutorial on Parallel Processing.'' The STARAN is an array processor that uses Multi-Dimensional-Access (MDA) memories and permutation networks to access data in bit slices in a variety of ways, with high-speed I/O capabilities. Its router (called the {\em flip} network) could permute data among the array processors, or between the array processors and external devices, including disks, video input, and displays.} } @InProceedings{baylor:methodology, author = {Sandra Johnson Baylor and Caroline Benveniste and Leo J. Beolhouwer}, title = {A Methodology for Evaluating Parallel {I/O} Performance for Massively Parallel Processors}, booktitle = {Proceedings of the 27th Annual Simulation Symposium}, year = {1994}, month = {April}, pages = {31--40}, keyword = {parallel I/O, parallel architecture, simulation, pario-bib} } @InProceedings{baylor:perfeval, author = {Sandra Johnson Baylor and Caroline B. Benveniste and Yarson Hsu}, title = {Performance Evaluation of a Parallel {I/O} Architecture}, booktitle = {Proceedings of the 9th ACM International Conference on Supercomputing}, year = {1995}, month = {July}, pages = {404--413}, publisher = {ACM Press}, address = {Barcelona}, earlier = {baylor:perfeval-tr}, keyword = {performance evaluation, parallel architecture, parallel I/O, pario-bib}, comment = {They use a simulator to evaluate the performance of a parallel I/O system. They simulate the network and disks under a synthetic workload, and measure the time it takes for I/O requests to traverse the network, be processed, and return. They also measure the impact of I/O requests on non-I/O messages. Their results are fairly unsurprising.} } @TechReport{baylor:perfeval-tr, author = {Sandra Johnson Baylor and Caroline B. Benveniste and Yarson Hsu}, title = {Performance Evaluation of a Parallel {I/O} Architecture}, year = {1995}, month = {May}, number = {RC~20049}, institution = {IBM T.~J. Watson Research Center}, later = {baylor:perfeval}, keyword = {performance evaluation, parallel architecture, parallel I/O, pario-bib} } @InProceedings{baylor:vulcan-perf, author = {Sandra Johnson Baylor and Caroline Benveniste and Yarsun Hsu}, title = {Performance Evaluation of a Massively Parallel {I/O} Subsystem}, booktitle = {Proceedings of the IPPS~'94 Workshop on Input/Output in Parallel Computer Systems}, year = {1994}, pages = {1--15}, organization = {IBM Watson Research Center}, note = {Also appeared in Computer Architecture News 22(4)}, later = {baylor:vulcan-perf-book}, keyword = {parallel I/O, parallel architecture, performance analysis, pario-bib}, comment = {See polished version baylor:vulcan-perf-book. Simulation of the I/O architecture for the Vulcan MPP at IBM TJW. This is a distributed-memory MIMD system with a bidirectional omega-type interconnection network, and separate compute and I/O nodes. They use a stochastic workload to evaluate the average I/O performance under a few different situations, and then use that average performance, along with a stochastic workload, in a detailed simulation of the interconnection network. (What would be the effect of adding variance to the I/O-node performance?) A key point is that the I/O node will not accept any more requests until a current write request is finished being processed (copied into the write-back cache). If there are many writes, this can backup the network (would a different write-request protocol help?) Not clear how concurrency of reads are modeled. Results show that network saturates for high request rates and small number of I/O nodes. As request rate decreases or number of I/O nodes increases, performance levels off to a reasonable value. Placement of I/O nodes didn't make much difference, nor did extra non-I/O traffic. Given their parameters, and for reasonable loads, 1 I/O node per 4 compute nodes was a reasonable balance, and was scalable.} } @InCollection{baylor:vulcan-perf-book, author = {Sandra Johnson Baylor and Caroline Benveniste and Yarsun Hsu}, title = {Performance Evaluation of a Massively Parallel {I/O} Subsystem}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {13}, editor = {Ravi Jain and John Werth and James C. Browne}, crossref = {iopads-book}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {293--311}, publisher = {Kluwer Academic Publishers}, earlier = {baylor:vulcan-perf}, keyword = {parallel I/O architecture, performance evaluation, pario-bib}, abstract = {Presented are the trace-driven simulation results of a study conducted to evaluate the performance of the internal parallel I/O subsystem of the Vulcan massively parallel processor (MPP) architecture. The system sizes evaluated vary from 16 to 512 nodes. The results show that a compute node to I/O node ratio of four is the most cost effective for all system sizes, suggesting high scalability. Also, processor-to-processor communication effects are negligible for small message sizes and the greater the fraction of I/O reads, the better the I/O performance. Worse case I/O node placement is within 13\% of more efficient placement strategies. Introducing parallelism into the internal I/O subsystem improves I/O performance significantly.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @InProceedings{baylor:workload, author = {Sandra Johnson Baylor and C. Eric Wu}, title = {Parallel {I/O} Workload Characteristics Using {Vesta}}, booktitle = {Proceedings of the IPPS~'95 Workshop on Input/Output in Parallel and Distributed Systems}, year = {1995}, month = {April}, pages = {16--29}, later = {baylor:workload-book}, keyword = {parallel I/O, workload characterization, pario-bib}, abstract = {In recent years, the design and performance evaluation of parallel processors has focused on the processor, memory and communication subsystems. As a result, these subsystems have better performance potential than the I/O subsystem. In fact, the I/O subsystem is the bottleneck in many machines. However, there are a number of studies currently underway to improve the design of parallel I/O subsystems. To develop optimal parallel I/O subsystem designs, one must have a thorough understanding of the workload characteristics of parallel I/O and its exploitation of the associated parallel file system. Presented are the results of a study conducted to analyze the parallel I/O workloads of several applications on a parallel processor using the Vesta parallel file system. Traces of the applications are obtained to collect system events, communication events, and parallel I/O events. The traces are then analyzed to determine workload characteristics. The results show I/O request rates on the order of hundreds of requests per second, a large majority of requests are for small amount of data (less than 1500 bytes), a few requests are for large amounts of data (on the order of megabytes), significant file sharing among processes within a job, and strong temporal, traditional spatial, and interprocess spatial locality.}, comment = {See polished version baylor:workload-book. They characterize four parallel applications: sort, matrix multiply, seismic migration, and video server, in terms of their I/O activity. They found results that are consistent with kotz:workload, in that they also found lots of small data requests, some large data requests, significant file sharing and interprocess locality. This study found less of the non-contiguous access than did kotz:workload, because of the logical views provided by Vesta. Note on-line postscript does not include figures.} } @InCollection{baylor:workload-book, author = {Sandra Johnson Baylor and C. Eric Wu}, title = {Parallel {I/O} Workload Characteristics Using {Vesta}}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {7}, editor = {Ravi Jain and John Werth and James C. Browne}, crossref = {iopads-book}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {167--185}, publisher = {Kluwer Academic Publishers}, earlier = {baylor:workload}, keyword = {parallel I/O, file access pattern, workload characterization, file system workload, pario-bib}, abstract = {To develop optimal parallel I/O subsystems, one must have a thorough understanding of the workload characteristics of parallel I/O and its exploitation of the associated parallel file system. Presented are the results of a study conducted to analyze the parallel I/O workloads of several applications on a parallel processor using the Vesta parallel file system. Traces of the applications are obtained to collect system events, communication events, and parallel I/O events. The traces are then analyzed to determine workload characteristics. The results show I/O request rates on the order of hundreds of requests per second, a large majority of requests are for small amounts of data (less than 1500 bytes), a few requests are for large amounts of data (on the order of megabytes), significant file sharing among processes within a job, and strong temporal, traditional spatial, and interprocess spatial locality.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @Manual{bbn:admin, key = {BBN}, author = {BBN Advanced {Computers Inc.}}, title = {{TC2000} System Administration Guide}, edition = {Revision 3.0}, year = {1991}, month = {April}, keyword = {BBN, parallel I/O, pario-bib}, comment = {Administrative manual for the TC2000 I/O system. Can stripe over partitions in a user-specified set of disks. Large requests automatically split and done in parallel. See also garber:tc2000.} } @TechReport{becher:ooc-solver, author = {Jonathan D. Becher and John F. Porter}, title = {Out of Core Dense Solvers for the {MasPar} Parallel Computer}, year = {1994}, number = {MP/IP/SP-37.94}, institution = {MasPar Computer Corporation}, keyword = {parallel I/O, scientific computing, linear algebra, pario-bib}, comment = {They look at out-of-core block and slab solvers for the Maspar. They overlap reading one block with the computation of the previous block. They solve matrices up to 40k x 40k, and obtain 3.14 GFlops even with I/O considered.} } @InProceedings{bell:physics, author = {Jean L. Bell}, title = {A Specialized Data Management System for Parallel Execution of Particle Physics Codes}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data}, year = {1988}, pages = {277--285}, publisher = {ACM Press}, address = {Chicago, IL}, keyword = {file access pattern, disk prefetch, file system, pario-bib}, comment = {A specialized database system for particle physics codes. Valuable for its description of access patterns and subsequent file access requirements. Particle-in-cell codes iterate over timesteps, updating the position of each particle, and then the characteristics of each cell in the grid. Particles may move from cell to cell. Particle update needs itself and nearby gridcell data. The whole dataset is too big for memory, and each timestep must be stored on disk for later analysis anyway. Regular file systems are inadequate: specialized DBMS is more appropriate. Characteristics needed by their application class: multidimensional access (by particle type or by location, i.e., multiple views of the data), coordination between grid and particle data, coordination between processors, coordinated access to meta-data, inverted files, horizontal clustering, large blocking of data, asynchronous I/O, array data, complicated joins, and prefetching according to user-prespecified order. Note that many of these things can be provided by a file system, but that most are hard to come by in typical file systems, if not impossible. Many of these features are generalizable to other applications.} } @InProceedings{benner:pargraphics, author = {Robert E. Benner}, title = {Parallel Graphics Algorithms on a 1024-Processor Hypercube}, booktitle = {Proceedings of the Fourth Conference on Hypercube Concurrent Computers and Applications}, year = {1989}, pages = {133--140}, publisher = {Golden Gate Enterprises, Los Altos, CA}, address = {Monterey, CA}, keyword = {hypercube, graphics, parallel algorithm, parallel I/O, pario-bib}, comment = {About using the nCUBE/10's RT Graphics System. They were frustrated by an unusual mapping from the graphics memory to the display, a shortage of memory on the graphics nodes, and small message buffers on the graphics nodes. They wrote some algorithms for collecting the columns of pixels from the hypercube nodes, and routing them to the appropriate graphics node. They also would have liked a better interconnection network between the graphics nodes, at least for synchronization.} } @InProceedings{bennett:jovian, author = {Robert Bennett and Kelvin Bryant and Alan Sussman and Raja Das and Joel Saltz}, title = {{Jovian}: A Framework for Optimizing Parallel {I/O}}, booktitle = {Proceedings of the Scalable Parallel Libraries Conference}, year = {1994}, month = {October}, pages = {10--20}, publisher = {IEEE Computer Society Press}, address = {Mississippi State, MS}, URL = {ftp://hpsl.cs.umd.edu/pub/papers/splc94.ps.Z}, keyword = {parallel I/O, pario-bib}, comment = {Jovian is a runtime library for use with SPMD codes, eg, HPF. They restrict IO to collective operations, and provide extra processes to 'coalesce' the many requests from multiple CPs into fewer larger requests to the operating system, perhaps optimized for access order. They mention that there is a standardization process underway for specifying data distributions. Also a compact representation for strided access to n-dimensional data structures. Coalescing basically means combining requests to eliminate duplication and to combine adjacent requests. Requests to coalescers are in full blocks, to lower the processing overhead. Nonetheless, their method involves moving requests around twice, and involve several memory-memory copies of the data, so their overhead is high.} } @Misc{berdahl:transport, author = {Lawrence Berdahl}, title = {Parallel Transport Protocol Proposal}, year = {1995}, month = {January 3,}, howpublished = {Lawrence Livermore National Labs}, note = {Draft}, earlier = {berdahl:woodenman}, URL = {ftp://ftp.cs.dartmouth.edu/pub/pario/berdahl:transport.ps.Z}, keyword = {parallel I/O, network, supercomputer system, pario-bib}, comment = {An update of berdahl:woodenman, close to the final draft.} } @Misc{berdahl:woodenman, author = {Lawrence Berdahl}, title = {Parallel Data Exchange}, year = {1994}, month = {January 28,}, howpublished = {Lawrence Livermore National Labs}, note = {WoodenMan Proposal}, later = {berdahl:transport}, keyword = {parallel I/O, network, supercomputer system, pario-bib}, comment = {They describe a protocol for making parallel data transfers of arbitrary data sets from one set of data servers to another set of data servers. The goal is to be independent of specific architectures or even types of data servers, and to work on top of existing transport protocols. The data set is described using a gather set for the source and a scatter set for the destination, and using a linear address space as an intermediate representation. All the servers are contacted, they figure out who they need to talk, and exchange port information with them. Each pair exchanges votes on who will control the transfer (ie, who will control the order of the transfer), and on their maximum data rates. This information is used to settle on the control and set of ports to be used. This proposal is not final and is under active development, so it may change.} } @Article{berrendorf:paragon, author = {R. Berrendorf and H. Burg and U. Detert}, title = {Performance Characteristics of Parallel Computers: {Intel Paragon} Case Study}, journal = {{IT+TI} Informationstechnik und Technische Informatik}, year = {1995}, month = {April}, volume = {37}, number = {2}, pages = {37--45}, note = {(In German).}, keyword = {parallel computing, performance evaluation, parallel file system, pario-bib}, comment = {In German. They summarize typical performance of the Intel Paragon, including the communication performance and the parallel file-system performance.} } @TechReport{bershad:sio-os, author = {Brian Bershad and David Black and David DeWitt and Garth Gibson and Kai Li and Larry Peterson and Marc Snir}, title = {Operating System Support for High-Performance Parallel {I/O} Systems}, year = {1994}, number = {CCSF-40}, institution = {Scalable I/O Initiative}, address = {Caltech Concurrent Supercomputing Facilities, Caltech}, URL = {http://www.ccsf.caltech.edu/SIO/SIO_osfs.ps}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, comment = {Four major components: networking, memory servers, file system, and persistent object store. Networking part focuses on low-latency support communication within an application, between applications, and between machines (Bershad and Peterson). Memory servers, shared virtual memory, and checkpointing support (Kai Li). File systems support includes benchmarking, transparent informed prefetching (Gibson), a common interface for PFS and Vesta (Snir), and integrating secondary and tertiary storage systems (including the integration of the National Storage Lab's HPSS (see coyne:hpss) into this project in 1995). OSF/1 (Black) will be extended to support parallel file systems, extent-like behavior, and block coalescing. Persistent object store (DeWitt) is radical change to an object-oriented interface, transparent I/O (though extensible and changable with subclassing, presumably), and heterogeneous support via the Object Definition Language standard. Persistent objects may be integrated with the memory servers and shared virtual memory. See also poole:sio-survey, bagrodia:sio-character, choudhary:sio-language.} } @InProceedings{berson:multimedia, author = {Steven Berson and Leana Golubchik and Richard R. Muntz}, title = {Fault Tolerant Design of Multimedia Servers}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data}, year = {1995}, pages = {364--375}, publisher = {ACM Press}, keyword = {fault tolerance, multimedia, video on demand, parallel I/O, pario-bib} } @InProceedings{best:cmmdio, author = {Michael L. Best and Adam Greenberg and Craig Stanfill and Lewis W. Tucker}, title = {{CMMD I/O}: A Parallel {Unix I/O}}, booktitle = {Proceedings of the Seventh International Parallel Processing Symposium}, year = {1993}, pages = {489--495}, publisher = {IEEE Computer Society Press}, address = {Newport Beach, CA}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, comment = {Much like Intel CFS, with different I/O modes that determine when the compute nodes synchronize, and the semantics of I/Os written to the file. They found it hard to get good bandwidth for independent I/Os, as opposed to coordinated I/Os; part of this was due to their RAID~3 disk array, but it is more complicated than that. Some performance numbers were given in talk.} } @InProceedings{bestavros:raid, author = {Azer Bestavros}, title = {{IDA}-Based Redundant Arrays of Inexpensive Disks}, booktitle = {Proceedings of the First International Conference on Parallel and Distributed Information Systems}, year = {1991}, month = {December}, pages = {2--9}, keyword = {RAID, disk array, reliability, parallel I/O, pario-bib}, comment = {Uses the Information Dispersal Algorithm (IDA) to generate $n+m$ blocks from $n$ blocks, to tolerate $m$ disk failures; all of the data from the $n$ blocks is hidden in the $n+m$ blocks. Not with the RAID project.} } @InProceedings{bester:gass, author = {Joseph Bester and Ian Foster and Carl Kesselman and Jean Tedesco and Steven Tuecke}, title = {{GASS}: A Data Movement and Access Service for Wide Area Computing Systems}, booktitle = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1999}, month = {May}, pages = {78--88}, publisher = {ACM Press}, address = {Atlanta, GA}, URL = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Tedesco.ps}, keyword = {wide-area network, parallel I/O, pario-bib}, abstract = {In wide area computing, programs frequently execute at sites that are distant from their data. Data access mechanisms are required that place limited functionality demands on an application or host system yet permit high-performance implementations. To address these requirements, we propose a data movement and access service called Global Access to Secondary Storage (GASS). This service defines a global name space via Uniform Resource Locators and allows applications to access remote files via standard I/O interfaces. High performance is achieved by incorporating default data movement strategies that are specialized for I/O patterns common in wide area applications and by providing support for programmer management of data movement. GASS forms part of the Globus toolkit, a set of services for high-performance distributed computing. GASS itself makes use of Globus services for security and communication, and other Globus components use GASS services for executable staging and real-time remote monitoring. Application experiences demonstrate that the library has practical utility.} } @InProceedings{bitton:schedule, author = {Dina Bitton}, title = {Arm Scheduling in Shadowed Disks}, booktitle = {Proceedings of IEEE Compcon}, year = {1989}, month = {Spring}, pages = {132--136}, keyword = {parallel I/O, disk shadowing, reliability, disk mirroring, disk optimization, pario-bib}, comment = {Goes further than bitton:shadow. Uses simulation to verify results from that paper, which were expressions for the expected seek distance of shadowed disks, using shortest-seek-time arm scheduling. Problem is her assumption that arm positions stay independent, in the face of correlating effects like writes, which move all arms to the same place. Simulations match model only barely, and only in some cases. Anyway, shadowed disks can improve performance for workloads more than 60 or 70\% reads.} } @InProceedings{bitton:shadow, author = {D. Bitton and J. Gray}, title = {Disk Shadowing}, booktitle = {Proceedings of the 14th International Conference on Very Large Data Bases}, year = {1988}, pages = {331--338}, keyword = {parallel I/O, disk shadowing, reliability, disk mirroring, disk optimization, pario-bib}, comment = {Also TR UIC EECS 88-1 from Univ of Illinois at Chicago. Shadowed disks are mirroring with more than 2 disks. Writes to all disks, reads from one with shortest seek time. Acknowledges but ignores problem posed by lo:disks. Also considers that newer disk technology does not have linear seek time $(a+bx)$ but rather $(a+b\sqrt{x})$. Shows that with either seek distribution the average seek time for workloads with at least 60\% reads decreases in the number of disks. See also bitton:schedule.} } @InProceedings{bjorstad:structure, author = {P. E. Bj{\o}rstad and J. Cook}, title = {Large Scale Structural Analysis On Massively Parallel Computers}, booktitle = {Linear Algebra for Large Scale and Real-Time Applications}, year = {1993}, pages = {3--11}, publisher = {Kluwer Academic Publishers}, note = {ftp from ftp.ii.uib.no in \verb+pub/tech_reports/mpp_sestra.ps.Z+.}, URL = {file://ftp.ii.uib.no/pub/tech_reports/mpp_sestra.ps.Z}, keyword = {parallel I/O, file access pattern, pario-bib}, comment = {A substantial part of this structural-analysis application was involved in I/O, moving substructures in and out of RAM. The Maspar IO-RAM helped a lot, nearly halving the time required. On the Cray, the SSD had an even bigger impact, perhaps 7--12 times faster. Their main conclusion is that caching helped. Most likely this was due to its double-buffering, since they structured the code to read/compute/write in large ``superblocks''.} } @Article{boral:bubba, author = {Haran Boral and William Alexander and Larry Clay and George Copeland and Scott Danforth and Michael Franklin and Brian Hart and Marc Smith and Patrick Valduriez}, title = {Prototyping {Bubba}, a Highly Parallel Database System}, journal = {IEEE Transactions on Knowledge and Data Engineering}, year = {1990}, month = {March}, volume = {2}, number = {1}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, database, disk caching, pario-bib}, comment = {More recent than copeland:bubba, and a little more general. This gives few details, and doesn't spend much time on the parallel I/O. Bubba does use parallel independent disks, with a significant effort to place data on the disks, and do the work local to the disks, to balance the load and minimize interprocessor communication. Also they use a single-level store (i.e., memory-mapped files) to improve performance of their I/O system, including page locking that is assisted by the MMU. The OS has hooks for the database manager to give memory-management policy hints.} } @InProceedings{boral:critique, author = {H. Boral and D. {DeWitt}}, title = {Database machines: an idea whose time has passed?}, booktitle = {Proceedings of the Second International Workshop on Database Machines}, year = {1983}, pages = {166--187}, publisher = {Springer-Verlag}, keyword = {file access pattern, parallel I/O, database machine, pario-bib}, comment = {Improvements in I/O bandwidth crucial for supporting database machines, otherwise highly parallel DB machines are useless (I/O bound). Two ways to do it: 1) synchronized interleaving by using custom controller and regular disks to read/write same track on all disks, which speeds individual accesses. 2) use very large cache (100-200M) to keep blocks to re-use and to do prefetching. But see dewitt:pardbs.} } @InProceedings{bordawekar:collective, author = {Rajesh Bordawekar}, title = {Implementation of Collective {I/O} in the {Intel Paragon} Parallel File System: Initial Experiences}, booktitle = {Proceedings of the 11th ACM International Conference on Supercomputing}, year = {1997}, month = {July}, pages = {20--27}, publisher = {ACM Press}, earlier = {bordawekar:collective-tr}, URL = {http://www.cacr.caltech.edu/~rajesh/ics97.ps}, keyword = {collective I/O, multiprocessor file system, parallel I/O, pario-bib}, comment = {bordawekar:collective was renamed bordawekar:collective-tr, so this could be called bordawekar:collective.} } @TechReport{bordawekar:collective-tr, author = {Rajesh Bordawekar}, title = {Implementation and Evaluation of Collective {I/O} in the {Intel Paragon Parallel File System}}, year = {1996}, month = {November}, number = {CACR~TR-128}, institution = {Center of Advanced Computing Research, California Insititute of Technology}, later = {bordawekar:collective}, URL = {http://www.cacr.caltech.edu/~rajesh/collective.html}, keyword = {parallel I/O, mutliprocessor file system, pario-bib}, abstract = {A majority of parallel applications obtain parallelism by partitioning data over multiple processors. Accessing distributed data structures like arrays from files often requires each processor to make a large number of small non-contiguous data requests. This problem can be addressed by replacing small non-contiguous requests by large collective requests. This approach, known as Collective I/O, has been found to work extremely well in practice. In this paper, we describe implementation and evaluation of a collective I/O prototype in a production parallel file system on the Intel Paragon. The prototype is implemented in the PFS subsystem of the Intel Paragon Operating System. We evaluate the collective I/O performance using its comparison with the PFS M_RECORD and M_UNIX I/O modes. It is observed that collective I/O provides significant performance improvement over accesses in M_UNIX mode. However, in many cases, various implementation overheads cause collective I/O to provide lower performance than the M_RECORD I/O mode.}, comment = {This tech report was called bordawekar:collective, then renamed bordawekar:collective-tr, on the appearance of the ICS paper bordawekar:collective.} } @InProceedings{bordawekar:comm, author = {Rajesh Bordawekar and Alok Choudhary}, title = {Communication Strategies for Out-of-core Programs on Distributed Memory Machines}, booktitle = {Proceedings of the 9th ACM International Conference on Supercomputing}, year = {1995}, month = {July}, pages = {395--403}, publisher = {ACM Press}, address = {Barcelona}, earlier = {bordawekar:comm-tr}, keyword = {parallel I/O, inter-processor communication, pario-bib}, comment = {bordawekar:comm-tr is nearly identical in content. Also bordawekar:commstrat is a shorter version.} } @TechReport{bordawekar:comm-tr, author = {Rajesh Bordawekar and Alok Choudhary}, title = {Communication Strategies for Out-of-core Programs on Distributed Memory Machines}, year = {1994}, number = {SCCS-667}, institution = {NPAC, Syracuse University}, later = {bordawekar:comm}, URL = {http://www.npac.syr.edu/pub/by_index/sccs/papers/ps/0660/sccs-0667.ps.Z}, keyword = {parallel I/O, inter-processor communication, pario-bib}, abstract = {In this paper, we show that communication in the out-of-core distributed memory problems requires both inter-processor communication and file I/O. Given that primary data structures reside in files, even communication requires I/O. Thus, it is important to optimize the I/O costs associated with a communication step. We present three methods for performing communication in out-of-core distributed memory problems. The first method, termed as the "out-of-core" communication method, follows a loosely synchronous model. Computation and Communication phases in this case are clearly separated, and communication requires permutation of data in files. The second method, termed as "demand-driven-in-core communication" considers only communication required of each in-core data slab individually. The third method, termed as "producer-driven-in-core communication" goes even one step further and tries to identify the potential (future) use of data while it is in memory. We describe these methods in detail and provide performance results for out-of-core applications; namely, two-dimensional FFT and two-dimensional elliptic solver. Finally, we discuss how "out-of-core" and "in-core" communication methods could be used in virtual memory environments on distributed memory machines.}, comment = {They compare different ways to do global communications in out-of-core applications, involving file I/O and communication at different times. They also comment briefly on how it would work if it depended on virtual memory at each node.} } @InProceedings{bordawekar:commstrat, author = {Rajesh Bordawekar and Alok Choudhary}, title = {Communication strategies for out-of-core programs on distributed memory machines}, booktitle = {Proceedings of the 1995 International Conference on High Performance Computing}, year = {1995}, month = {December}, pages = {130--135}, address = {New Delhi, India}, earlier = {bordawekar:comm}, keyword = {interprocessor communication, parallel I/O, pario-bib}, comment = {Small version of bordawekar:comm.} } @Article{bordawekar:compcomm, author = {Rajesh Bordawekar and Alok Choudhary and J. Ramanujam}, title = {Compilation and Communication Strategies for Out-of-core programs on Distributed-Memory Machines}, journal = {Journal of Parallel and Distributed Computing}, year = {1996}, month = {November}, volume = {38}, number = {2}, pages = {277--288}, publisher = {Academic Press}, earlier = {bordawekar:compcomm-tr}, keyword = {compiler, communication, out-of-core, parallel I/O, inter-processor communication, pario-bib}, abstract = {It is widely acknowledged that improving parallel I/O performance is critical for widespread adoption of high performance computing. In this paper, we show that communication in out-of-core distributed memory problems may require both inter-processor communication and file I/O. Thus, in order to improve I/O performance, it is necessary to minimize the I/O costs associated with a communication step. We present three methods for performing communication in out-of-core distributed memory problems. The first method called the generalized collective communication method follows a loosely synchronous model; computation and communication phases are clearly separated, and communication requires permutation of data in files. The second method called the receiver-driven in-core communication considers only communication required of each in-core data slab individually. The third method called the owner-driven in-core communication goes even one step further and tries to identify the potential future use of data (by the recipients) while it is in the sender's memory. We describe these methods in detail and present a simple heuristic to choose a communication method from among the three methods. We then provide performance results for two out-of-core applications, the two-dimensional FFT code and the two-dimensional elliptic Jacobi solver. Finally, we discuss how the out-of-core and in-core communication methods can be used in virtual memory environments on distributed memory machines.} } @TechReport{bordawekar:compcomm-tr, author = {Rajesh Bordawekar and Alok Choudhary and J. Ramanujam}, title = {Compilation and Communication Strategies for Out-of-core programs on Distributed Memory Machines}, year = {1995}, month = {November}, number = {CACR-113}, institution = {Scalable I/O Initiative, Center of Advanced Computing Research, California Insititute of Technology}, later = {bordawekar:compcomm}, URL = {http://www.cat.syr.edu/~rajesh/cacr113.ps}, abstract = {It is widely acknowledged that improving parallel I/O performance is critical for widespread adoption of high performance computing. In this paper, we show that communication in out-of-core distributed memory problems may require both inter-processor communication and file I/O. Thus, in order to improve I/O performance, it is necessary to minimize the I/O costs associated with a communication step. We present three methods for performing communication in out-of-core distributed memory problems. The first method called the generalized collective communication method follows a loosely synchronous model; computation and communication phases are clearly separated, and communication requires permutation of data in files. The second method called the receiver-driven in-core communication considers only communication required of each in-core data slab individually. The third method called the owner-driven in-core communication goes even one step further and tries to identify the potential future use of data (by the recipients) while it is in the sender's memory. We describe these methods in detail and present a simple heuristic to choose a communication method from among the three methods. We then provide performance results for two out-of-core applications, the two-dimensional FFT code and the two-dimensional elliptic Jacobi solver. Finally, we discuss how the out-of-core and in-core communication methods can be used in virtual memory environments on distributed memory machines.}, comment = {See also bordawekar:comm, at ICS'95.} } @InCollection{bordawekar:compiling, author = {Rajesh Bordawekar and Alok Choudhary}, title = {Issues in Compiling {I/O} Intensive Problems}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {3}, editor = {Ravi Jain and John Werth and James C. Browne}, crossref = {iopads-book}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {69--96}, publisher = {Kluwer Academic Publishers}, keyword = {parallel I/O, compiler, out-of-core, pario-bib}, abstract = {None.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @TechReport{bordawekar:compositional, author = {Rajesh Bordawekar}, title = {A Case for Compositional File Systems (Extended Abstract)}, year = {1998}, month = {March}, number = {CACR TR-161}, institution = {Center of Advanced Computing Research, California Insititute of Technology}, URL = {http://www.cacr.caltech.edu/~rajesh/beowulf-fs.html}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, abstract = {This article presents a case for compositional file systems (CFSs). The CFS is designed using the end-to-end argument; the basic file system attributes, therefore, are independent of the user requirements. The CFS is designed as a functionally compositional, structurally distributed, and dynamically extendable file system. The article also discusses the advantages and implementation alternatives for these file systems, and outlines possible applications.} } @InProceedings{bordawekar:delta-fs, author = {Rajesh Bordawekar and Alok Choudhary and Juan Miguel Del Rosario}, title = {An Experimental Performance Evaluation of {Touchstone Delta Concurrent File System}}, booktitle = {Proceedings of the 7th ACM International Conference on Supercomputing}, year = {1993}, pages = {367--376}, publisher = {ACM Press}, earlier = {bordawekar:delta-fs-TR}, URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/ics93.ps.Z}, keyword = {performance evaluation, multiprocessor file system, parallel I/O, pario-bib}, abstract = {For a high-performance parallel machine to be a scalable system, it must also have a scalable parallel I/O system. Recently, several commercial machines (e.g. Intel Touchstone Delta, Paragon, CM-5, Ncube-2) have been built that provide features for parallel I/O. However, very little is understood about the performance of these I/O systems. This paper presents an experimental evaluation of the Intel Touchstone Delta's Concurrent File System (CFS). The CFS utilizes the declustering of large files across the disks to improve the I/O performance. Data files can be read or written on the CFS using 4 access modes. \par We present performance measurements for the CFS on the Touchstone Delta with 512 compute nodes and 32 I/O nodes. The study focuses on file read/write rates for various configurations of I/O and compute nodes. The study attempts to show the effect of access modes, buffer sizes and volume restrictions on the system performance. The paper also shows that the performance of the CFS can greatly vary for various data distributions commonly employed in scientific and engineering applications.}, comment = {Some new numbers over bordawekar:delta-fs-TR, but basically the same conclusions.} } @TechReport{bordawekar:delta-fs-TR, author = {Rajesh Bordawekar and Alok Choudhary and Juan Miguel Del Rosario}, title = {An Experimental Performance Evaluation of {Touchstone Delta Concurrent File System}}, year = {1992}, number = {SCCS-420}, institution = {NPAC, Syracuse University}, later = {bordawekar:delta-fs}, keyword = {performance evaluation, multiprocessor file system, parallel I/O, pario-bib}, comment = {Evaluating the Caltech Touchstone Delta (512 nodes, 32 I/O nodes, 64 disks, 8 MB cache per I/O node). Basic measurements of different access patterns and I/O modes. Location in network doesn't seem to matter. Throughput is often limited by the software; at least, the full hardware throughputs are rarely obtained. Sometimes they are compnode-limited, and other times they may be being limited by the cache management. There must be a way to push bottleneck back to the disks .} } @TechReport{bordawekar:efficient, author = {Rajesh Bordawekar and Rajeev Thakur and Alok Choudhary}, title = {Efficient Compilation of Out-of-core Data Parallel Programs}, year = {1994}, month = {April}, number = {SCCS-622}, institution = {NPAC}, later = {bordawekar:reorganize}, URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/access_reorg.ps.Z}, keyword = {parallel I/O, compiler, pario-bib}, abstract = {Large scale scientific applications, such as the Grand Challenge applications, deal with very large quantities of data. The amount of main memory in distributed memory machines is usually not large enough to solve problems of realistic size. This limitation results in the need for system and application software support to provide efficient parallel I/O for out-of-core programs. This paper describes techniques for translating out-of-core programs written in a data parallel language like HPF to message passing node programs with explicit parallel I/O. We describe the basic compilation model and various steps involved in the compilation. The compilation process is explained with the help of an out-of-core matrix multiplication program. We first discuss how an out-of-core program can be translated by extending the method used for translating in-core programs. We then describe how the compiler can optimize the code by estimating the I/O costs associated with different array access patterns and selecting the method with the least I/O cost. This optimization can reduce the amount of I/O by as much as an order of magnitude. Performance results on the Intel Touchstone Delta are presented and analyzed.}, comment = {Revised as bordawekar: This is actually fairly different from thakur:runtime. They describe the same basic compiler technique, where arrays are distributed across processors, and each processor has a local array file for holding data from its local partitions. Then the I/O needed for a loop is broken into slabs, where the program proceeds as an alternation of (read slabs, compute, write slabs). The big new thing here is that the compiler tries different ways to form slabs (e.g., by row or by column), estimates the number of I/Os and the amount of data moved for each case, and chooses the case with the smallest amount of I/O. They also mention how the choice of memory size allocated to different arrays affects the amount of IO, but give no algorithm other than "try all the possibilities."} } @Article{bordawekar:exemplar, author = {Rajesh Bordawekar and Steven Landherr and Don Capps and Mark Davis}, title = {Experimental Evaluation of the {Hewlett-Packard Exemplar} File System}, journal = {ACM SIGMETRICS Performance Evaluation Review}, year = {1997}, month = {December}, volume = {25}, number = {3}, pages = {21--28}, keyword = {multiprocessor file system, performance evaluation, parallel I/O, pario-bib}, comment = {Part of a special issue on parallel and distributed I/O.} } @TechReport{bordawekar:exemplar-tr2, author = {Rajesh Bordawekar}, title = {Quantitative Characterization and Analysis of the {I/O} Behavior of a Commercial Distributed-shared-memory Machine}, year = {1998}, month = {March}, number = {CACR 157}, institution = {Center of Advanced Computing Research, California Insititute of Technology}, URL = {http://www.cacr.caltech.edu/~rajesh/exemplar1.html}, keyword = {parallel I/O, pario-bib, workload characterization, distributed shared memory}, abstract = {This paper presents a unified evaluation of the I/O behavior of a commercial clustered DSM machine, the HP Exemplar. Our study has the following objectives: (1) To evaluate the impact of different interacting system components, namely, architecture, operating system, and programming model, on the overall I/O behavior and identify possible performance bottlenecks and (2) To provide hints to the users for achieving high out-of-box I/O throughput. We find that for the DSM machines that are built as a cluster of SMP nodes, integrated clustering of computing and I/O resources, both hardware and software, is not advantageous for two reasons. First, within an SMP node, the I/O bandwidth is often restricted by the performance of the peripheral components and cannot match the memory bandwidth. Second, since the I/O resources are shared as a global resource, the file-access costs become non-uniform and the I/O behavior of the entire system, in terms of the scalability and balance, degrades. We observe that the buffered I/O performance is determined not only by the I/O subsystem, but also by the programming model, global-shared memory subsystem, and data-communication mechanism. Moreover, programming-model support can be effectively used to overcome the performance constraints created by the architecture and operating system. For example, on the HP Exemplar, users can achieve high I/O throughput by using features of the programming model that balance the sharing and locality of the user buffers and file systems. Finally, we believe that at present, the I/O subsystems are being designed in isolation and there is a need for mending the traditional memory-oriented design approach to address this problem.} } @TechReport{bordawekar:framework, author = {Rajesh Bordawekar and Alok Choudhary}, title = {A Framework for Representing Data Parallel Programs and its Application in Program Reordering}, year = {1995}, month = {March}, number = {SCCS-698}, institution = {NPAC, Syracuse University}, URL = {http://www.npac.syr.edu/techreports/html/0650/abs-0698.html}, keyword = {data parallel, parallel I/O, pario-bib}, comment = {Although this is mostly a compilers paper, there is a little bit about parallel I/O here. They comment briefly on how their compiler framework will help them make a compiler that can provide advice to the file system about prefetching and cache replacement, and to decide on the layout of scratch files to optimize locality.} } @TechReport{bordawekar:hpf, author = {Rajesh Bordawekar and Alok Choudhary}, title = {{HPF} with Parallel {I/O} Extensions}, year = {1993}, number = {SCCS-613}, institution = {NPAC, Syracuse University}, URL = {http://www.npac.syr.edu/techreports/ps/0600/sccs-0613.ps.Z}, keyword = {parallel I/O, pario-bib}, comment = {They propose some extensions to HPF to accomodate parallel I/O.} } @TechReport{bordawekar:hpfio, author = {Rajesh Bordawekar and Alok Choudhary}, title = {Extending {I/O} Capabilities of {High Performance Fortran}: Initial Experiences}, year = {1995}, month = {December}, number = {CACR-115}, institution = {Scalable I/O Initiative, Center of Advanced Computing Research, California Insititute of Technology}, keyword = {parallel I/O, compiler, FORTRAN, HPF, pario-bib}, abstract = {This report presents implementation details of the prototype PASSION compiler. The PASSION compiler provides support for: (1) Accessing multidimensional in-core arrays and (2) Out-of-core computations. The PASSION compiler takes as input an annotated I/O intensive (either an out-of-core program or program accessing distributed arrays from files) High Performance Fortran (HPF) program. Using hints provided by the user, the compiler modifies the computation so as to minimize the I/O cost and restructures the program to incorporate explicit I/O calls. In this report, compilation of out-of-core FORALL constructs is illustrated using representative programs. Compiler support for accessing distributed in-core data is explained using illustrative examples and supplemented by experimental results.}, comment = {Currently not available on WWW. Describes implementation details of the PASSION Compiler.} } @InProceedings{bordawekar:model, author = {Rajesh Bordawekar and Alok Choudhary and Ken Kennedy and Charles Koelbel and Michael Paleczny}, title = {A Model and Compilation Strategy for Out-of-core Data Parallel Programs}, booktitle = {Proceedings of the Fifth ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, year = {1995}, month = {July}, pages = {1--10}, publisher = {ACM Press}, address = {Santa Barbara, CA}, note = {Also available as the following technical reports: NPAC Technical Report SCCS-0696, CRPC Technical Report CRPC-TR94507-S, SIO Technical Report CACR SIO-104}, earlier = {bordawekar:model-tr}, URL = {http://www.cacr.caltech.edu/techpubs/PAPERS/cacr104.ps}, keyword = {parallel I/O, compiler, pario-bib}, abstract = {It is widely acknowledged in high-performance computing circles that parallel input/output needs substantial improvement in order to make scalable computers truly usable. We present a data storage model that allows processors independent access to their own data and a corresponding compilation strategy that integrates data-parallel computation with data distribution for out-of-core problems. Our results compare several communication methods and I/O optimizations using two out-of-core problems, Jacobi iteration and LU factorization.} } @TechReport{bordawekar:model-tr, author = {Rajesh Bordawekar and Alok Choudhary and Ken Kennedy and Charles Koebel and Mike Paleczny}, title = {A Model and Compilation Strategy for Out-of-Core Data Parallel Programs}, year = {1994}, month = {December}, number = {CRPC-TR94507-S}, institution = {CRPC}, later = {bordawekar:model}, URL = {gopher://softlib.rice.edu/99/softlib/CRPC-TRs/reports/CRPC-TR94507-S.ps}, keyword = {compilers, parallel I/O, out-of-core applications, pario-bib}, comment = {Basically a summary of their I/O and compilation model for out-of-core compilation of HPF programs. See also paleczny:support.} } @MastersThesis{bordawekar:msthesis, author = {Rajesh R. Bordawekar}, title = {Issues in Software Support for Parallel {I/O}}, year = {1993}, month = {May}, school = {Syracuse University}, URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/msthesis.ps.Z}, keyword = {parallel I/O, pario-bib}, abstract = {This thesis looks at various issues in providing application-level software support for parallel I/O. We show that the performance of the parallel I/O system varies greatly as a function of data distributions. We present runtime I/O primitives for parallel languages which allow the user to obtain a consistent performance over a wide range of data distributions. \par In order to design these primitives, we study various parameters used in the design of a parallel file system. We evaluate the performance of Touchstone Delta Concurrent File System and study the effect of parameters like number of processors, number of disks, file size on the system performance. We compute the I/O costs for common data distributions. We propose an alternative strategy -two phase data access strategy- to optimize the I/O costs connected with data distributions. We implement runtime primitives using the two-phase access strategy and show that using these primitives not only I/O access rates are improved but also user can obtain complex data distributions like block-block and block-cyclic.}, comment = {This is basically a consolidation of the other bordawekar papers, in more detail. So he covers an experimental analysis of the touchstone delta; of the problems arising from the direct-access model for non-conforming distributions; of the two-phase model; and of the run-time library to support two-phase access. See also bordawekar:reorganize, thakur:runtime, bordawekar:efficient, thakur:out-of-core, delrosario:two-phase, bordawekar:primitives, bordawekar:delta-fs.} } @InProceedings{bordawekar:placement, author = {Rajesh Bordawekar and Alok Choudhary and J. Ramanujam}, title = {A Framework for Integrated Communication and {I/O} Placement}, booktitle = {Proceedings of the 2nd International Euro-Par'96, Parallel Processing}, year = {1996}, month = {August}, series = {Lecture Notes in Computer Science}, volume = {1124}, pages = {541--552}, publisher = {Springer-Verlag}, earlier = {bordawekar:placement-tr}, URL = {http://www.cacr.caltech.edu/~rajesh/europar-rajesh.ps}, keyword = {parallel I/O, compiler, pario-bib}, abstract = {This paper describes a framework for analyzing dataflow within an out-of-core parallel program. Dataflow properties of FORALL statement are analyzed and a unified I/O and communication placement framework is presented. This placement framework can be applied to many problems, which include eliminating redudant I/O incurred in communication. The framework is validated by applying it for optimizing I/O and communication in out-of-core stencil problems. Experimental performance results on an Intel Paragon show significant reduction in I/O and communication overhead.} } @TechReport{bordawekar:placement-tr, author = {Rajesh Bordawekar and Alok Choudhary and J. Ramanujam}, title = {A Framework for Integrated Communication and {I/O} Placement}, year = {1996}, month = {February}, number = {CACR-117}, institution = {Scalable I/O Initiative, Center of Advanced Computing Research, California Insititute of Technology}, later = {bordawekar:placement}, URL = {http://www.cacr.caltech.edu/~rajesh/cacr117.ps}, keyword = {parallel I/O, compiler, pario-bib}, abstract = {In this paper, we describe a framework for optimizing communication and I/O costs in out-of-core problems. We focus on communication and I/O optimization within a FORALL construct. We show that existing frameworks do not extend directly to out-of-core problems and can not exploit the FORALL semantics. We present a unified framework for the placement of I/O and communication calls and apply it for optimizing communication for stencil applications. Using the experimental results, we demonstrate that correct placement of I/O and communication calls can completely eliminate extra file I/O from communication and obtain significant performance improvement.} } @InProceedings{bordawekar:primitives, author = {Rajesh Bordawekar and Juan Miguel {del Rosario} and Alok Choudhary}, title = {Design and Evaluation of Primitives for Parallel {I/O}}, booktitle = {Proceedings of Supercomputing '93}, year = {1993}, pages = {452--461}, publisher = {IEEE Computer Society Press}, address = {Portland, OR}, URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/sc93.ps.Z}, keyword = {parallel I/O, pario-bib}, abstract = {In this paper, we show that the performance of parallel file systems can vary greatly as a function of the selected data distributions, and that some data distributions can not be supported. Also, we describe how the parallel language extensions, though simplifying the programming, do not address the performance problems found in parallel file systems. \par We have devised an alternative scheme for conducting parallel I/O - the Two-Phase Access Strategy - which guarantees higher and more consistent performance over a wider spectrum of data distributions. We have designed and implemented runtime primitives that make use of the two-phase access strategy to conduct parallel I/O, and facilitate the programming of parallel I/O operations. We describe these primitives in detail and provide performance results which show that I/O access rates are improved by up to several orders of magnitude. Further, we show that the variation in performance over various data distributions is restricted to within a factor of 2 of the best access rate.}, comment = {Much of this is the same as delrosario:two-phase, except for section~4 where they describe their actual run-time library of primitives, with a little bit about how it works. It's not clear, for example, how their meta-data structures are distributed across the machine. They also do not describe their methods for the data redistribution.} } @TechReport{bordawekar:reorganize, author = {Rajesh Bordawekar and Alok Choudhary and Rajeev Thakur}, title = {Data Access Reorganizations in Compiling Out-of-core Data Parallel Programs on Distributed Memory Machines}, year = {1994}, month = {September}, number = {SCCS-622}, institution = {NPAC}, address = {Syracuse, NY 13244}, earlier = {bordawekar:efficient}, URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/access_reorg.ps.Z}, keyword = {parallel I/O, compilation, pario-bib}, comment = {Basically they give a case study of out-of-core matrix multiplication to emphasize that the compiler's choice of loop ordering and matrix distribution for in-core matmult is not a very good choice for out-of-core matmult, because it causes too much I/O. By reorganizing the data and the loops, they get much better performance. In this particular case there are known algorithms which they should have used. In general they make the point that the compiler should consider several organizations, and estimate their costs, before generating code. They don't propose anything more sophisticated than to try all the possible organizations.} } @InProceedings{bordawekar:stencil, author = {Rajesh Bordawekar and Alok Choudhary and J. Ramanujam}, title = {Automatic Optimization of Communication in Compiling Out-of-core Stencil Codes}, booktitle = {Proceedings of the 10th ACM International Conference on Supercomputing}, year = {1996}, month = {May}, pages = {366--373}, publisher = {ACM Press}, address = {Philadelphia, PA}, earlier = {bordawekar:stencil-tr}, URL = {http://www.cat.syr.edu/~rajesh/ics96.ps}, keyword = {compiler, parallel I/O, pario-bib}, abstract = {In this paper, we describe a technique for optimizing commununication for out-of-core distributed memory stencil problems. In these problems, communication may require both inter-processor communication and file I/O. We show that in certain cases, extra file I/O incurred in communication can be completely eliminated by reordering in-core computations. The in-core computation pattern is decided by: (1) how the out-of-core data distributed into in-core slabs (tiling) and (2) how the slabs are accessed. We show that a compiler using the stencil and processor information can choose the tiling parameters and schedule the tile accesses so that the extra file I/O is eliminated and overall performance is improved.} } @TechReport{bordawekar:stencil-tr, author = {Rajesh Bordawekar and Alok Choudhary and J. Ramanujam}, title = {Automatic Optimization of Communication in Out-of-core Stencil Codes}, year = {1995}, month = {November}, number = {CACR-114}, institution = {Scalable I/O Initiative, Center of Advanced Computing Research, California Insititute of Technology}, later = {bordawekar:stencil}, keyword = {compiler, parallel I/O, pario-bib}, abstract = {In this paper, we describe a technique for optimizing commununication for out-of-core distributed memory stencil problems. In these problems, communication may require both inter-processor communication and file I/O. We show that in certain cases, extra file I/O incurred in communication can be completely eliminated by reordering in-core computations. The in-core computation pattern is decided by: (1) how the out-of-core data distributed into in-core slabs (tiling) and (2) how the slabs are accessed. We show that a compiler using the stencil and processor information can choose the tiling parameters and schedule the tile accesses so that the extra file I/O is eliminated and overall performance is improved.} } @InProceedings{bordawekar:support, author = {Rajesh Bordawekar and Alok Choudhary}, title = {Compiler and Runtime Support For Parallel {I/O}}, booktitle = {Proceedings of IFIP Working Conference (WG10.3) on Programming Environments for Massively Parallel Distributed Systems}, year = {1994}, month = {April}, publisher = {Birkhaeuser Verlag AG, Basel, Switzerland}, address = {Monte Verita, Ascona, Switzerland}, keyword = {parallel I/O, pario-bib}, comment = {Contains much of the material from bordawekar:hpf.} } @PhdThesis{bordawekar:thesis, author = {Rajesh Bordawekar}, title = {Techniques for Compiling {I/O} Intensive Parallel Programs}, year = {1996}, month = {April}, school = {Electrical and Computer Engineering Dept., Syracuse University}, note = {Also available as Caltech technical report CACR-118}, URL = {http://www.cat.syr.edu/~rajesh/thesis.html}, keyword = {parallel I/O, compiler, HPF, pario-bib}, abstract = {This dissertation investigates several issues in providing compiler support for I/O intensive parallel programs. In this dissertation, we focus on satisfying two I/O requirements, namely, support for accessing multidimensional arrays and support for {\it out-of-core} computations. We analyze working spaces in I/O intensive programs and propose three execution models to be used by users or compilers for developing efficient I/O intensive parallel programs. Different phases in compiling out-of-core parallel programs are then described. Three different methods for performing communication are presented and validated using representative application templates.We illustrate that communication in out-of-core programs may require both inter-processor communication and file I/O. We show that using the {\it copy-in-copy-out} semantics of the HPF {\tt FORALL} construct, extra file I/O incurred in communication can be completely eliminated by reordering in-core computations. Two different approaches for reordering in-core computations are presented, namely, integrated tiling and scheduling heuristic, and dataflow framework for placing communication and I/O calls. The discussion is supplemented with experimental performance results of representative stencil applications. Finally, an overview of the prototype \textsf{PASSION} (Parallel And Scalable Software for I/O) compiler is presented. This compiler takes an annotated out-of-core High Performance Fortran (HPF) program as input and generates the corresponding {\it node+message-passing} program with calls to the parallel I/O runtime library. We illustrate various functionalities of the compiler using example programs and supplement them by experimental results.} } @InProceedings{bornstein:reshuffle, author = {C. Bornstein and P. Steenkiste}, title = {Data Reshuffling in Support of Fast {I/O} For Distributed-Memory Machines}, booktitle = {Proceedings of the Third IEEE International Symposium on High Performance Distributed Computing}, year = {1994}, month = {August}, pages = {227--235}, keyword = {parallel I/O, distributed memory, pario-bib}, comment = {In a sense, this is about a two-phase technique for network I/O. They consider the problem of feeding a fast network interface (HIPPI) from a distributed-memory parallel machine (iWARP) in which the individual internal links are slower than the external network. So they get the processors to cooperate to reshuffle the data into a canonical layout that is convenient to send to the gateway node, and from there onto the external network.} } @InProceedings{bradley:ipsc2io, author = {David K. Bradley and Daniel A. Reed}, title = {Performance of the {Intel iPSC/2} Input/Output System}, booktitle = {Proceedings of the Fourth Conference on Hypercube Concurrent Computers and Applications}, year = {1989}, pages = {141--144}, publisher = {Golden Gate Enterprises, Los Altos, CA}, address = {Monterey, CA}, keyword = {hypercube, parallel I/O, Intel, pario-bib}, comment = {Some measurements and simulations of early CFS performance. Looks terrible, but they disclaim that it is a beta version of the first CFS. They determined that the disks are the bottleneck. But this may just imply that they need more disks. Their parallel synthetic applications had each process read a separate file. CFS had ridiculous traffic overhead. Again, this was beta CFS.} } @TechReport{brandwijn:dasd, author = {Alexandre Brandwajn}, title = {Performance Benefits of Parallelism in Cached {DASD} Controllers}, year = {1988}, month = {November}, number = {UCSC-CRL-88-30}, institution = {Computer Research Laboratory, UC Santa Cruz}, keyword = {parallel I/O, disk caching, disk architecture, pario-bib}, comment = {Some new DASD products with caches overlap cache hits with prefetch of remainder of track into cache. They use analytical model to evaluate performance of these. They find performance improvements of 5-15 percent under their assumptions.} } @InProceedings{brezany:HPF, author = {Peter Brezany and Michael Gernt and Piyush Mehotra and Hans Zima}, title = {Concurrent File Operations in a {High Performance FORTRAN}}, booktitle = {Proceedings of Supercomputing '92}, year = {1992}, pages = {230--237}, keyword = {supercomputing, fortran, multiprocessor file system interface, pario-bib}, comment = {Describing their way of writing arrays to files so that they are written in a fast, parallel way, and so that (if read in same distribution) they can be read fast and parallel. Normal read and write forces standard ordering, but cread and cwrite uses a compiler and runtime selected ordering, which is stored in the file so it can be used when rereading. Good for temp files.} } @InProceedings{brezany:architecture, author = {Peter Brezany and Thomas A. Mueck and Erich Schikuta}, title = {A Software Architecture for Massively Parallel Input-Output}, booktitle = {Third International Workshop PARA'96 (Applied Parallel Computing - Industrial Computation and Optimization)}, year = {1996}, month = {August}, series = {Lecture Notes in Computer Science}, volume = {1186}, pages = {85--96}, publisher = {Springer-Verlag}, address = {Lyngby, Denmark}, note = {Also available as Technical Report of the Inst. f.~Angewandte Informatik u. Informationssysteme, University of Vienna, TR~96202}, URL = {http://www.pri.univie.ac.at/~schiki/research/paper/para96/para96.ps}, keyword = {compiler transformations, runtime support, parallel I/O, prefetching, pario-bib}, abstract = {For an increasing number of data intensive scientific applications, parallel I/O concepts are a major performance issue. Tackling this issue, we provide an outline of an input/output system designed for highly efficient, scalable and conveniently usable parallel I/O on distributed memory systems. The main focus of this paper is the parallel I/O runtime system support provided for software-generated programs produced by parallelizing compilers in the context of High Performance FORTRAN efforts. Specifically, our design is presented in the context of the Vienna Fortran Compilation System.} } @InProceedings{brezany:compiling, author = {Peter Brezany and Thomas A. Mueck and Erich Schikuta}, title = {Mass Storage Support for a Parallelizing Compilation System}, booktitle = {International Conference Eurosim'96-- HPCN challenges in Telecomp and Telecom: Parallel Simulation of Complex Systems and Large Scale Applications}, year = {1996}, month = {June}, pages = {63--70}, publisher = {North-Holland, Elsevier Science}, address = {Delft, The Netherlands}, URL = {http://www.pri.univie.ac.at/~schiki/research/paper/eurosim96/eurosim96.ps}, keyword = {parallel I/O, high performance mass storage system, high performance languages, compilation techniques, data administration, pario-bib} } @InProceedings{brezany:io-support, author = {Peter Brezany and Thomas A. Mueck and Erich Schikuta}, title = {Language, Compiler and Parallel Database Support for {I/O} Intensive Applications}, booktitle = {Proceedings of the International Conference on High Performance Computing and Networking}, year = {1995}, month = {May}, series = {Lecture Notes in Computer Science}, volume = {919}, pages = {14--20}, publisher = {Springer-Verlag}, address = {Milan, Italy}, note = {also available as Technical Report of the Inst. f.~Software Technology and Parallel Systems, University of Vienna, TR95-8, 1995}, URL = {http://www.pri.univie.ac.at/~schiki/research/paper/techrep/tr95-8.ps}, keyword = {compiler transformations, runtime support, declustering, parallel I/O, pario-bib}, comment = {They describe some extensions to Vienna Fortran that support parallel I/O, and how they plan to extend the compiler and run-time system to help. They are somewhat short on details, however. The basic idea is that file declustering is based on hints from the compiler or programmer about how the file will be used, eg, as a matrix distributed in thus-and-so way.} } @Article{brezany:irregular, author = {P. Brezany and A. Choudhary and M. Dang}, title = {Parallelization of irregular out-of-core applications for distributed-memory systems}, journal = {High-Performance Computing and Networking}, year = {1997}, series = {Lecture Notes in Computer Science}, volume = {1225}, pages = {811--820}, publisher = {Springer-Verlag}, earlier = {brezany:irregular-tr}, keyword = {parallel I/O, out of core, compiler, library, pario-bib}, abstract = {Large scale irregular applications involve data arrays and other data structures that are too large to fit in main memory and hence reside on disks; such applications are called out-of-core applications. This paper presents techniques for implementing this kind of applications. In particular we present a design for a runtime system to efficiently support parallel execution of irregular out-of-core codes on distributed-memory systems. Furthermore, we describe the appropriate program transformations required to reduce the I/O overheads for staging data as well as for communication while maintaining load balance. The proposed techniques can be used by a parallelizing compiler or by users writing programs in node + message passing style. We have done a preliminary implementation of the techniques presented here. We introduce experimental results from a template CFD code to demonstrate the efficacy of the presented techniques.}, comment = {The authors present techniques for implementing large scale irregular out-of-core applications. The techniques they describe can be used by a parallel compiler (e.g., HPF and its extensions) or by users using message passing. The objectives of the proposed techniques are to ''to minimize I/O accesses in all steps while maintaining load balance and minimal communication''. They demonstrate the effectivness of their techniques by showing results from a Computational Fluid Dynamics (CFD) code.} } @TechReport{brezany:irregular-tr, author = {P. Brezany and A. Choudhary}, title = {Techniques and Optimizations for Developing Irregular Out-of-Core Applications on Distributed-Memory Systems}, year = {1996}, month = {November}, number = {96-4}, institution = {Institute for Software Technology and Parallel Systems, University of Vienna}, URL = {http://www.pri.univie.ac.at/~schiki/research/vipios/paper/brezany-choudhary.ps}, keyword = {parallel I/O, out of core, irregular applications, compiler, pario-bib} } @InProceedings{broom:acacia, author = {Bradley M. Broom},