BibTeX bibliography file: Parallel I/O
Ongoing Edition
Last updated: Sat Feb 22 14:27:43 EST 1997

This edition supersedes my older bibliographies.

This bibliography is available on the WWW at
   http://www.cs.dartmouth.edu/pario/bib/
and by ftp at
   ftp://ftp.cs.dartmouth.edu/pub/pario/pario.bib
Both of which are easily reached by the Parallel-I/O Archive at
   http://www.cs.dartmouth.edu/pario/

This bibliography covers parallel I/O, with a significant emphasis on file systems rather than, say, network or graphics I/O. This includes architecture, operating systems, some algorithms, some databases, and some workload characterization. Because of the expanding nature of this field, I cannot cover everything, and this bibliography is admittedly spotty on topics like disk arrays, parallel databases, and parallel networking.

The entries are alphabetized by cite key. The emphasis is on including everything I have, rather than selecting a few key articles of interest. Thus, you probably don't want (or need) to read everything here. There are many repeated entries, in the sense that a paper is often published first as a TR, then in a conference, then in a journal. The "earlier" and "later" tags tie together versions of a paper.

Except where noted, all comments are mine, and any opinions expressed there are mine only. In some cases I am simply restating the opinion or result obtained by the paper's authors, and thus even I might disagree with the statement. I keep most editorial comments to a minimum.

Please send any additions or corrections (new abstracts and URLs would be great!) to parallel-io-bib at dartmouth.edu. Indeed, if you want to get updates to the bibliography (released once per week), subscribe to that mailing list by sending a message to majordomo at dartmouth.edu whose BODY says
   subscribe parallel-io-bib

You may use the bibliography as you please except for publishing it as a whole, since the compilation is mine.
Please leave this header on the collection; BibTeX won't mind.

David Kotz
Associate Professor
Department of Computer Science
Dartmouth College
6211 Sudikoff Laboratory
Hanover, NH 03755-3510 USA
URL: http://www.cs.dartmouth.edu/~dfk/
603-646-1439
@string {email = "dfk@cs.dartmouth.edu"} % have to hide this from bibtex

% BibTeX bibliography file

@InProceedings{abali:ibm370,
  author = {B{\"u}lent Abali and Bruce D. Gavril and Richard W. Hadsell and Linh Lam and Brion Shimamoto},
  title = {{Many/370: A} Parallel Computer Prototype for {I/O} Intensive Applications},
  booktitle = {Proceedings of the Sixth Annual Distributed-Memory Computer Conference},
  year = {1991},
  pages = {728--730},
  keyword = {parallel I/O, multiprocessor file system, pario-bib},
  comment = {Describes a parallel IBM/370, where they attach several small 370s to a switch, and several disks to each 370. Not much in the way of striping.}
}

@Article{abu-safah:speedup,
  author = {Walid Abu-Safah and Harlan Husmann and David Kuck},
  title = {On {Input/Output} Speed-up in Tightly-coupled Multiprocessors},
  journal = {IEEE Transactions on Computers},
  year = {1986},
  pages = {520--530},
  publisher = {IEEE Computer Society Press},
  keyword = {parallel I/O, I/O, pario-bib},
  comment = {Derives formulas for the speedup with and without I/O considered and with parallel software and hardware format conversion. Considering I/O gives a more optimistic view of the speedup of a program {\em assuming} that the parallel version can use its I/O bandwidth as effectively as the serial processor. Concludes that, for a given number of processors, increasing the I/O bandwidth is the most effective way to speed up the program (over the format conversion improvements).}
}

@InProceedings{acharya:tuning,
  author = {Anurag Acharya and Mustafa Uysal and Robert Bennett and Assaf Mendelson and Michael Beynon and Jeffrey K.
Hollingsworth and Joel Saltz and Alan Sussman},
  title = {Tuning the Performance of {I/O} Intensive Parallel Applications},
  booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems},
  year = {1996},
  month = {May},
  pages = {15--27},
  publisher = {ACM Press},
  address = {Philadelphia},
  keyword = {parallel I/O, filesystem workload, parallel application, pario-bib},
  abstract = {Getting good I/O performance from parallel programs is a critical problem for many application domains. In this paper, we report our experience tuning the I/O performance of four application programs from the areas of satellite-data processing and linear algebra. After tuning, three of the four applications achieve application-level I/O rates of over 100 MB/s on 16 processors. The total volume of I/O required by the programs ranged from about 75 MB to over 200 GB. We report the lessons learned in achieving high I/O performance from these applications, including the need for code restructuring, local disks on every node and knowledge of future I/O requests. We also report our experience on achieving high performance on peer-to-peer configurations. Finally, we comment on the necessity of complex I/O interfaces like collective I/O and strided requests to achieve high performance.}
}

@Article{aggarwal:sorting,
  author = {Alok Aggarwal and Jeffrey Scott Vitter},
  title = {The Input/Output Complexity of Sorting and Related Problems},
  journal = {Communications of the ACM},
  year = {1988},
  month = {September},
  volume = {31},
  number = {9},
  pages = {1116--1127},
  URL = {ftp://cs.duke.edu/pub/jsv/Papers/AV.IO.ps.Z},
  keyword = {parallel I/O, sorting, pario-bib},
  abstract = {We provide tight upper and lower bounds, up to a constant factor, for the number of inputs and outputs~(I/Os) between internal memory and secondary storage required for five sorting-related problems: sorting, the fast Fourier transform (FFT), permutation networks, permuting, and matrix transposition.
The bounds hold both in the worst case and in the average case, and in several situations the constant factors match. Secondary storage is modeled as a magnetic disk capable of transferring $P$~blocks each containing $B$~records in a single time unit; the records in each block must be input from or output to $B$~contiguous locations on the disk. We give two optimal algorithms for the problems, which are variants of merge sorting and distribution sorting. In particular we show for $P=1$ that the standard merge sorting algorithm is an optimal external sorting method, up to a constant factor in the number of~I/Os. Our sorting algorithms use the same number of~I/Os as does the permutation phase of key sorting, except when the internal memory size is extremely small, thus affirming the popular adage that key sorting is not faster. We also give a simpler and more direct derivation of Hong and Kung's lower bound for the FFT for the special case $B = P = O(1)$.},
  comment = {Good comments on typical external sorts, and how big they are. Focuses on parallelism at the disk. They give tight theoretical bounds on the number of I/O's required to do external sorting and other problems (FFTs, matrix transpose, etc.). If $x$ is the number of blocks in the file and $y$ is the number of blocks that fit in memory, then the number of I/Os needed grows as $\Theta (x \log x / \log y)$. If parallel transfers of $p$ blocks are allowed, speedup linear in $p$ is obtained.}
}

@InProceedings{agrawal:asynch,
  author = {Gagan Agrawal and Anurag Acharya and Joel Saltz},
  title = {An Interprocedural Framework for Placement of Asynchronous {I/O} Operations},
  booktitle = {Proceedings of the 10th ACM International Conference on Supercomputing},
  year = {1996},
  month = {May},
  pages = {358--365},
  publisher = {ACM Press},
  address = {Philadelphia, PA},
  keyword = {compiler, I/O, pario-bib},
  comment = {Not really about parallel applications or parallel I/O, but I think it may be of interest to that community.
They propose a compiler framework for a compiler to insert asynchronous I/O operations (start I/O, finish I/O), to satisfy the dependency constraints of the program.}
}

@InProceedings{alverson:tera,
  author = {Robert Alverson and David Callahan and Daniel Cummings and Brian Koblenz and Allan Porterfield and Burton Smith},
  title = {The {Tera} Computer System},
  booktitle = {Proceedings of the 1990 ACM International Conference on Supercomputing},
  year = {1990},
  pages = {1--6},
  keyword = {parallel architecture, MIMD, NUMA, pario-bib},
  comment = {Interesting architecture. 3-d mesh of pipelined packet-switch nodes, e.g., 16x16x16 is 4096 nodes, with 256 procs, 512 memory units, 256 I/O cache units, and 256 I/O processors attached. 2816 remaining nodes are just switching nodes. Each processor is 64-bit custom chip with up to 128 simultaneous threads in execution. It alternates between ready threads, with a deep pipeline. Inter-instruction dependencies explicitly encoded by the compiler, stalling those threads until the appropriate time. Each thread has a complete set of registers! Memory units have 4-bit tags on each word, for full/empty and trap bits. Shared memory across the network: ``The Tera ISP-level architecture is UMA, even though the PMS-level architecture is NUMA. Put another way, the memory looks a single cycle away to the compiler writer.'' -- Burton Smith.
See also tera:brochure.}
}

@InProceedings{ap:enwrich,
  author = {Apratim Purakayastha and Carla Schlatter Ellis and David Kotz},
  title = {{ENWRICH:} A Compute-Processor Write Caching Scheme for Parallel File Systems},
  booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems},
  year = {1996},
  month = {May},
  pages = {55--68},
  publisher = {ACM Press},
  address = {Philadelphia},
  earlier = {ap:enwrich-tr},
  URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/ap:enwrich.ps.Z},
  keyword = {parallel file system, parallel I/O, caching, pario-bib, dfk},
  abstract = {Many parallel scientific applications need high-performance I/O. Unfortunately, end-to-end parallel-I/O performance has not been able to keep up with substantial improvements in parallel-I/O hardware because of poor parallel file-system software. Many radical changes, both at the interface level and the implementation level, have recently been proposed. One such proposed interface is {\em collective I/O}, which allows parallel jobs to request transfer of large contiguous objects in a single request, thereby preserving useful semantic information that would otherwise be lost if the transfer were expressed as per-processor non-contiguous requests. Kotz has proposed {\em disk-directed I/O} as an efficient implementation technique for collective-I/O operations, where the compute processors make a single collective data-transfer request, and the I/O processors thereafter take full control of the actual data transfer, exploiting their detailed knowledge of the disk-layout to attain substantially improved performance. \par Recent parallel file-system usage studies show that writes to write-only files are a dominant part of the workload. Therefore, optimizing writes could have a significant impact on overall performance. In this paper, we propose ENWRICH, a compute-processor write-caching scheme for write-only files in parallel file systems.
ENWRICH combines low-overhead write caching at the compute processors with high performance disk-directed I/O at the I/O processors to achieve both low latency and high bandwidth. This combination facilitates the use of the powerful disk-directed I/O technique independent of any particular choice of interface. By collecting writes over many files and applications, ENWRICH lets the I/O processors optimize disk I/O over a large pool of requests. We evaluate our design via simulated implementation and show that ENWRICH achieves high performance for various configurations and workloads.}
}

@TechReport{ap:enwrich-tr,
  author = {Apratim Purakayastha and Carla Schlatter Ellis and David Kotz},
  title = {{ENWRICH:} A Compute-Processor Write Caching Scheme for Parallel File Systems},
  year = {1995},
  month = {October},
  number = {CS-1995-22},
  institution = {Dept. of Computer Science, Duke University},
  later = {ap:enwrich},
  URL = {ftp://ftp.cs.duke.edu/dist/techreport/1995/1995-22.ps.gz},
  keyword = {parallel file system, parallel I/O, caching, pario-bib, dfk},
  abstract = {Many parallel scientific applications need high-performance I/O. Unfortunately, end-to-end parallel-I/O performance has not been able to keep up with substantial improvements in parallel-I/O hardware because of poor parallel file-system software. Many radical changes, both at the interface level and the implementation level, have recently been proposed. One such proposed interface is {\em collective I/O}, which allows parallel jobs to request transfer of large contiguous objects in a single request, thereby preserving useful semantic information that would otherwise be lost if the transfer were expressed as per-processor non-contiguous requests.
Kotz has proposed {\em disk-directed I/O} as an efficient implementation technique for collective-I/O operations, where the compute processors make a single collective data-transfer request, and the I/O processors thereafter take full control of the actual data transfer, exploiting their detailed knowledge of the disk-layout to attain substantially improved performance. \par Recent parallel file-system usage studies show that writes to write-only files are a dominant part of the workload. Therefore, optimizing writes could have a significant impact on overall performance. In this paper, we propose ENWRICH, a compute-processor write-caching scheme for write-only files in parallel file systems. ENWRICH combines low-overhead write caching at the compute processors with high performance disk-directed I/O at the I/O processors to achieve both low latency and high bandwidth. This combination facilitates the use of the powerful disk-directed I/O technique independent of any particular choice of interface. By collecting writes over many files and applications, ENWRICH lets the I/O processors optimize disk I/O over a large pool of requests. We evaluate our design via simulated implementation and show that ENWRICH achieves high performance for various configurations and workloads.}
}

@PhdThesis{ap:thesis,
  author = {Apratim Purakayastha},
  title = {Characterizing and Optimizing Parallel File Systems},
  year = {1996},
  month = {June},
  school = {Dept. of Computer Science, Duke University},
  address = {Durham, NC},
  note = {Also available as technical report CS-1996-10},
  URL = {ftp://ftp.cs.duke.edu/dist/techreport/1996/1996-10.ps.gz},
  keyword = {parallel I/O, multiprocessor file system, file access patterns, workload characterization, file caching, disk-directed I/O, pario-bib},
  abstract = {High-performance parallel file systems are needed to satisfy tremendous I/O requirements of parallel scientific applications.
The design of such parallel file systems depends on a comprehensive understanding of the expected workload, but so far there have been very few usage studies of multiprocessor file systems. In the first part of this dissertation, we attempt to fill this void by measuring a real file-system workload on a production parallel machine, namely the CM-5 at the National Center for Supercomputing Applications. We collect information about nearly every individual I/O request from the mix of jobs running on the machine. Analysis of the traces leads to various recommendations for design of future parallel file systems. Our usage study showed that writes to write-only files are a dominant part of the workload. Therefore, optimizing writes could have a significant impact on overall performance. In the second part of this dissertation, we propose ENWRICH, a compute-processor write-caching scheme for write-only files in parallel file systems. Within its framework, ENWRICH uses a recently proposed high performance implementation of collective I/O operations called disk-directed I/O, but it eliminates a number of limitations of disk-directed I/O. ENWRICH combines low-overhead write caching at the compute processors with high performance disk-directed I/O at the I/O processors to achieve both low latency and high bandwidth. This combination facilitates the use of the powerful disk-directed I/O technique independent of any particular choice of interface, and without the requirement for mapping libraries at the I/O processors. By collecting writes over many files and applications, ENWRICH lets the I/O processors optimize disk I/O over a large pool of requests. We evaluate our design of ENWRICH using simulated implementation and extensive experimentation. We show that ENWRICH achieves high performance for various configurations and workloads. We pinpoint the reasons for ENWRICH's failure to perform well for certain workloads, and suggest possible enhancements.
Finally, we discuss the nuances of implementing ENWRICH on a real platform and speculate about possible adaptations of ENWRICH for emerging multiprocessing platforms.},
  comment = {See also ap:enwrich, ap:workload, and nieuwejaar:workload}
}

@InProceedings{ap:workload,
  author = {Apratim Purakayastha and Carla Schlatter Ellis and David Kotz and Nils Nieuwejaar and Michael Best},
  title = {Characterizing Parallel File-Access Patterns on a Large-Scale Multiprocessor},
  booktitle = {Proceedings of the Ninth International Parallel Processing Symposium},
  year = {1995},
  month = {April},
  pages = {165--172},
  earlier = {ap:workload-tr},
  later = {nieuwejaar:workload-tr},
  URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/ap:workload.ps.Z},
  keyword = {parallel I/O, file access pattern, multiprocessor file system, file system workload, dfk, pario-bib},
  abstract = {High-performance parallel file systems are needed to satisfy tremendous I/O requirements of parallel scientific applications. The design of such high-performance parallel file systems depends on a comprehensive understanding of the expected workload, but so far there have been very few usage studies of multiprocessor file systems. This paper is part of the CHARISMA project, which intends to fill this void by measuring real file-system workloads on various production parallel machines. In particular, here we present results from the CM-5 at the National Center for Supercomputing Applications. Our results are unique because we collect information about nearly every individual I/O request from the mix of jobs running on the machine.
Analysis of the traces leads to various recommendations for parallel file-system design.},
  comment = {See also kotz:workload, nieuwejaar:strided.}
}

@TechReport{ap:workload-tr,
  author = {Apratim Purakayastha and Carla Schlatter Ellis and David Kotz and Nils Nieuwejaar and Michael Best},
  title = {Characterizing Parallel File-Access Patterns on a Large-Scale Multiprocessor},
  year = {1994},
  month = {October},
  number = {CS-1994-33},
  institution = {Dept. of Computer Science, Duke University},
  later = {ap:workload},
  URL = {ftp://ftp.cs.duke.edu/pub/dist/techreport/1994/1994-33.ps.Z},
  keyword = {parallel I/O, file access pattern, multiprocessor file system, file system workload, dfk, pario-bib},
  abstract = {Rapid increases in the computational speeds of multiprocessors have not been matched by corresponding performance enhancements in the I/O subsystem. To satisfy the large and growing I/O requirements of some parallel scientific applications, we need parallel file systems that can provide high-bandwidth and high-volume data transfer between the I/O subsystem and thousands of processors. \par Design of such high-performance parallel file systems depends on a thorough grasp of the expected workload. So far there have been no comprehensive usage studies of multiprocessor file systems. Our CHARISMA project intends to fill this void. The first results from our study involve an iPSC/860 at NASA Ames. This paper presents results from a different platform, the CM-5 at the National Center for Supercomputing Applications. The CHARISMA studies are unique because we collect information about every individual read and write request and about the entire mix of applications running on the machines. \par The results of our trace analysis lead to recommendations for parallel file system design. First, the file system should support efficient concurrent access to many files, and I/O requests from many jobs under varying load conditions.
Second, it must efficiently manage large files kept open for long periods. Third, it should expect to see small requests, predominantly sequential access patterns, application-wide synchronous access, no concurrent file-sharing between jobs, appreciable byte and block sharing between processes within jobs, and strong interprocess locality. Finally, the trace data suggest that node-level write caches and collective I/O request interfaces may be useful in certain environments.},
  comment = {See also kotz:workload, nieuwejaar:strided.}
}

@TechReport{arendt:genome,
  author = {James W. Arendt},
  title = {Parallel Genome Sequence Comparison Using a Concurrent File System},
  year = {1991},
  number = {UIUCDCS-R-91-1674},
  institution = {University of Illinois at Urbana-Champaign},
  keyword = {parallel file system, parallel I/O, Intel iPSC/2, pario-bib},
  comment = {Studies the performance of Intel CFS. Uses an application that reads in a huge file of records, each a genome sequence, and compares each sequence against a given sequence. Looks at cache performance, message latency, cost of prefetches and directory reads, and throughput. He tries one-disk, one-proc transfer rates. Because of contention with the directory server on one of the two I/O nodes, it was faster to put all of the file on the other I/O node. Striping is good for multiple readers. Best access pattern was interleaved, not segmented or separate files, because it avoided disk seeks. Perhaps the files are stored contiguously? Can get good speedup by reading the sequences in big integral record sizes, from CFS, using a load-balancing scheduled by the host.
Contention for directory blocks -- through single-node directory server.}
}

@InProceedings{armen:disk-model,
  author = {Chris Armen},
  title = {Bounds on the Separation of Two Parallel Disk Models},
  booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems},
  year = {1996},
  month = {May},
  pages = {122--127},
  publisher = {ACM Press},
  address = {Philadelphia},
  keyword = {parallel I/O, theory, parallel I/O algorithm, pario-bib},
  abstract = {The single-disk, D-head model of parallel I/O was introduced by Aggarwal and Vitter to analyze algorithms for problem instances that are too large to fit in primary memory. Subsequently Vitter and Shriver proposed a more realistic model in which the disk space is partitioned into D disks, with a single head per disk. To date, each problem for which there is a known optimal algorithm for both models has the same asymptotic bounds on both models. Therefore, it has been unknown whether the models are equivalent or whether the single-disk model is strictly more powerful. \par In this paper we provide evidence that the single-disk model is strictly more powerful. We prove a lower bound on any general simulation of the single-disk model on the multi-disk model and establish randomized and deterministic upper bounds. Let $N$ be the problem size and let $T$ be the number of parallel I/Os required by a program on the single-disk model. Then any simulation of this program on the multi-disk model will require $\Omega\left(T \frac{\log(N/D)}{\log \log(N/D)}\right)$ parallel I/Os. This lower bound holds even if replication is allowed in the multi-disk model. We also show an $O\left(\frac{\log D}{\log \log D}\right)$ randomized upper bound and an $O\left(\log D (\log \log D)^2\right)$ deterministic upper bound.
These results exploit an interesting analogy between the disk models and the PRAM and DCM models of parallel computation.}
}

@InProceedings{arunachalam:prefetch,
  author = {Meenakshi Arunachalam and Alok Choudhary and Brad Rullman},
  title = {A Prefetching Prototype for the Parallel File System on the {Paragon}},
  booktitle = {Proceedings of the 1995 ACM Sigmetrics Conference on Measurement and Modeling of Computer Systems},
  year = {1995},
  month = {May},
  pages = {321--323},
  note = {Extended Abstract},
  keyword = {parallel I/O, prefetching, parallel file system, pario-bib},
  comment = {A related paper is to appear in IPPS'96.}
}

@InProceedings{arunachalam:prefetch2,
  author = {Meenakshi Arunachalam and Alok Choudhary and Brad Rullman},
  title = {Implementation and evaluation of prefetching in the {Intel Paragon Parallel File System}},
  booktitle = {Proceedings of the Tenth International Parallel Processing Symposium},
  year = {1996},
  month = {April},
  pages = {554--559},
  URL = {http://www.ece.nwu.edu/~meena/papers/ipps.ps},
  keyword = {parallel I/O, prefetching, multiprocessor file system, pario-bib},
  abstract = {The significant difference between the speeds of the I/O system (e.g., disks) and compute processors in parallel systems creates a bottleneck that lowers the performance of an application that does a considerable amount of disk accesses. A major portion of the compute processors' time is wasted on waiting for I/O to complete. This problem can be addressed to a certain extent, if the necessary data can be fetched from the disk before the I/O call to the disk is issued. Fetching data ahead of time, known as prefetching in a multiprocessor environment depends a great deal on the application's access pattern. The subject of this paper is implementation and performance evaluation of a prefetching prototype in a production parallel file system on the Intel Paragon.
Specifically, this paper presents a) design and implementation of a prefetching strategy in the parallel file system and b) performance measurements and evaluation of the file system with and without prefetching. The prototype is designed at the operating system level for the PFS. It is implemented in the PFS subsystem of the Intel Paragon Operating System. It is observed that in many cases prefetching provides considerable performance improvements. In some other cases no improvements or some performance degradation is observed due to the overheads incurred in prefetching.},
  comment = {See arunachalam:prefetch.}
}

@InProceedings{asbury:fortranio,
  author = {Raymond K. Asbury and David S. Scott},
  title = {{FORTRAN} {I/O} on the {iPSC/2}: Is there read after write?},
  booktitle = {Proceedings of the Fourth Conference on Hypercube Concurrent Computers and Applications},
  year = {1989},
  pages = {129--132},
  publisher = {Golden Gate Enterprises, Los Altos, CA},
  address = {Monterey, CA},
  keyword = {parallel I/O, hypercube, Intel iPSC/2, file access pattern, pario-bib}
}

@InProceedings{asthana:active,
  author = {Abhaya Asthana and Mark Cravatts and Paul Krzyzanowski},
  title = {An Experimental Active Memory Based {I/O} Subsystem},
  booktitle = {Proceedings of the IPPS~'94 Workshop on Input/Output in Parallel Computer Systems},
  year = {1994},
  month = {April},
  pages = {73--84},
  organization = {AT\&T Bell Labs},
  note = {Also appeared in Computer Architecture News 22(4)},
  later = {asthana:active-book},
  keyword = {parallel I/O, architecture, pario-bib},
  comment = {They describe an I/O subsystem based on an ``active memory'' called SWIM (Structured Wafer-based Intelligent Memory). SWIM chips are RAM chips with some built-in processing. The idea is that these tiny processors can manipulate the data in the chip at full speed, without dealing with memory bus or off-chip costs. Further, the chips can work in parallel.
They demonstrate how they've used this to build a national phone database server, a high-performance IP router, and a call-screening agent.}
}

@InCollection{asthana:active-book,
  author = {Abhaya Asthana and Mark Cravatts and Paul Krzyzanowski},
  title = {An Experimental Memory-based {I/O} Subsystem},
  booktitle = {Input/Output in Parallel and Distributed Computer Systems},
  chapter = {17},
  editor = {Ravi Jain and John Werth and James C. Browne},
  year = {1996},
  series = {The Kluwer International Series in Engineering and Computer Science},
  volume = {362},
  pages = {373--390},
  publisher = {Kluwer Academic Publishers},
  earlier = {asthana:active},
  keyword = {parallel I/O architecture, pario-bib},
  abstract = {We describe an I/O subsystem based on an active memory named SWIM (Structured Wafer-based Intelligent Memory) designed for efficient storage and manipulation of data structures. The key architectural idea in SWIM is to associate some processing logic with each memory chip that allows it to perform data manipulation operations locally and to communicate with a disk or a communication line through a backend port. The processing logic is specially designed to perform operations such as pointer dereferencing, memory indirection, searching and bounds checking efficiently. The I/O subsystem is built using an interconnected ensemble of such memory logic pairs. A complex processing task can now be distributed between a large number of small memory processors each doing a sub-task, while still retaining a common locus of control in the host CPU for higher level administrative and provisioning functions. We argue that active memory based processing enables more powerful, scalable and robust designs for storage and communications subsystems, that can support emerging network services, multimedia workstations and wireless PCS systems. A complete parallel hardware and software system constructed using an array of SWIM elements has been operational for over a year.
We present results from application of SWIM to three network functions: a national phone database server, a high performance IP router, and a call screening agent.},
  comment = {Part of a whole book on parallel I/O; see iopads-book.}
}

@InProceedings{avalani:channels,
  author = {Bhavan Avalani and Alok Choudhary and Ian Foster and Rakesh Krishnaiyer},
  title = {Integrating Task and Data Parallelism Using Parallel {I/O} Techniques},
  booktitle = {Proceedings of the International Workshop on Parallel Processing},
  year = {1994},
  month = {December},
  address = {Bangalore, India},
  URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/task_data.ps.Z},
  keyword = {parallel I/O, pario-bib},
  comment = {They describe using the techniques of delrosario and debenedictis (although without mentioning them) to provide for channels (parallel pipes) between independent data-parallel tasks. The technique really is the same as in debenedictis and delrosario, although they extend it a bit to allow multiple ``files'' within a channel (why not use multiple channels)? Also, they depend on the program to read and write synchronization variables to control access to the flow of data through the channel. While this may provide good performance in some cases, why not have support for automatic flow control? The system can detect when a portion of the channel is written, and release readers waiting on that portion of the channel (if any). The paper is a bit confusing in its use of the word ``file'', which seems to be used to mean different things at different points.
Also, they seem to use an arbitrary distribution for the ``file'', which may or may not be the same as one of those used by the two endpoints.}
}

@TechReport{bagrodia:sio-character,
  author = {Rajive Bagrodia and Andrew Chien and Yarson Hsu and Daniel Reed},
  title = {Input/Output: Instrumentation, Characterization, Modeling and Management Policy},
  year = {1994},
  number = {CCSF-41},
  institution = {Scalable I/O Initiative},
  address = {Caltech Concurrent Supercomputing Facilities, Caltech},
  URL = {http://www.ccsf.caltech.edu/SIO/SIO_perf.ps},
  keyword = {parallel I/O, pario-bib, prefetching, caching, multiprocessor file system, file access pattern},
  comment = {Basically there are two parts to this paper. First, they will instrument applications, Intel PFS, and IBM Vesta, to trace I/O-related activity. Then they'll use Pablo to analyze and characterize. They plan to trace some events in detail, and the rest with histogram counters. Second, they plan to develop caching and prefetching policies and to analyze those with simulation, analysis, and implementation. They note that IBM and Intel are developing parallel I/O architecture simulators. See also poole:sio-survey, choudhary:sio-language, bershad:sio-os.}
}

@InProceedings{baird:disa,
  author = {R. Baird and S. Karamooz and H. Vazire},
  title = {Distributed Information Storage Architecture},
  booktitle = {Proceedings of the Twelfth IEEE Symposium on Mass Storage Systems},
  year = {1993},
  pages = {145--155},
  keyword = {parallel I/O, distributed file system, mass storage, pario-bib},
  comment = {Architecture for distributed information storage. Integrates file systems, databases, etc. Single system image, lots of support for administration. O-O model, with storage device objects, logical device objects, volume objects, and file objects. Methods for each type of object, including administrative methods.}
}

@InProceedings{baldwin:hyperfs,
  author = {C. H. Baldwin and W. C.
Nestlerode}, title = {A Large Scale File Processing Application on a Hypercube}, booktitle = {Proceedings of the Fifth Annual Distributed-Memory Computer Conference}, year = {1990}, pages = {1400--1404}, keyword = {multiprocessor file system, file access pattern, parallel I/O, hypercube, pario-bib}, comment = {Census-data processing on an nCUBE/10 at USC. Their program uses an interleaved pattern, which is like my lfp or gw with multi-record records (i.e., the application does its own blocking). Shifted to asynchronous I/O to do OBL manually. Better results if they did more computation per I/O (of course).} } @TechReport{barak:hfs, author = {Amnon Barak and Bernard A. Galler and Yaron Farber}, title = {A Holographic File System for a Multicomputer with Many Disk Nodes}, year = {1988}, month = {May}, number = {88-6}, institution = {Dept. of Computer Science, Hebrew University of Jerusalem}, keyword = {parallel I/O, hashing, reliability, disk mirroring, pario-bib}, comment = {Describes a file system for a distributed system that scatters records of each file over many disks using hash functions. The hash function is known by all processors, so no one processor must be up to access the file. Any portion of the file whose disknode is available may be accessed. Shadow nodes are used to take over for nodes that go down, saving the info for later use by the proper node. Intended to easily parallelize read/write accesses and global file operations, and to increase file availability.} } @Article{batcher:staran, author = {K. E.
Batcher}, title = {{STARAN} Parallel Processor System Hardware}, journal = {AFIPS Conference Proceedings}, year = {1974}, pages = {405--410}, keyword = {parallel architecture, array processor, parallel I/O, SIMD, pario-bib}, comment = {This paper is reproduced in Kuhn and Padua's (1981, IEEE) survey ``Tutorial on Parallel Processing.'' The STARAN is an array processor that uses Multi-Dimensional-Access (MDA) memories and permutation networks to access data in bit slices in a variety of ways, with high-speed I/O capabilities. Its router (called the {\em flip} network) could permute data among the array processors, or between the array processors and external devices, including disks, video input, and displays.} } @InProceedings{baylor:methodology, author = {Sandra Johnson Baylor and Caroline Benveniste and Leo J. Beolhouwer}, title = {A Methodology for Evaluating Parallel {I/O} Performance for Massively Parallel Processors}, booktitle = {Proceedings of the 27th Annual Simulation Symposium}, year = {1994}, month = {April}, pages = {31--40}, keyword = {parallel I/O, parallel architecture, simulation, pario-bib} } @InProceedings{baylor:perfeval, author = {Sandra Johnson Baylor and Caroline B. Benveniste and Yarsun Hsu}, title = {Performance Evaluation of a Parallel {I/O} Architecture}, booktitle = {Proceedings of the 9th ACM International Conference on Supercomputing}, year = {1995}, month = {July}, pages = {404--413}, publisher = {ACM Press}, address = {Barcelona}, earlier = {baylor:perfeval-tr}, keyword = {performance evaluation, parallel architecture, parallel I/O, pario-bib}, comment = {They use a simulator to evaluate the performance of a parallel I/O system. They simulate the network and disks under a synthetic workload, and measure the time it takes for I/O requests to traverse the network, be processed, and return. They also measure the impact of I/O requests on non-I/O messages.
Their results are fairly unsurprising.} } @TechReport{baylor:perfeval-tr, author = {Sandra Johnson Baylor and Caroline B. Benveniste and Yarsun Hsu}, title = {Performance Evaluation of a Parallel {I/O} Architecture}, year = {1995}, month = {May}, number = {RC~20049}, institution = {IBM T.~J. Watson Research Center}, later = {baylor:perfeval}, keyword = {performance evaluation, parallel architecture, parallel I/O, pario-bib} } @InProceedings{baylor:vulcan-perf, author = {Sandra Johnson Baylor and Caroline Benveniste and Yarsun Hsu}, title = {Performance Evaluation of a Massively Parallel {I/O} Subsystem}, booktitle = {Proceedings of the IPPS~'94 Workshop on Input/Output in Parallel Computer Systems}, year = {1994}, pages = {1--15}, organization = {IBM Watson Research Center}, note = {Also appeared in Computer Architecture News 22(4)}, later = {baylor:vulcan-perf-book}, keyword = {parallel I/O, parallel architecture, performance analysis, pario-bib}, comment = {See polished version baylor:vulcan-perf-book. Simulation of the I/O architecture for the Vulcan MPP at IBM TJW. This is a distributed-memory MIMD system with a bidirectional omega-type interconnection network, and separate compute and I/O nodes. They use a stochastic workload to evaluate the average I/O performance under a few different situations, and then use that average performance, along with a stochastic workload, in a detailed simulation of the interconnection network. (What would be the effect of adding variance to the I/O-node performance?) A key point is that the I/O node will not accept any more requests until a current write request is finished being processed (copied into the write-back cache). If there are many writes, this can back up the network (would a different write-request protocol help?) Not clear how concurrency of reads is modeled. Results show that network saturates for high request rates and small number of I/O nodes.
As request rate decreases or number of I/O nodes increases, performance levels off to a reasonable value. Placement of I/O nodes didn't make much difference, nor did extra non-I/O traffic. Given their parameters, and for reasonable loads, 1 I/O node per 4 compute nodes was a reasonable balance, and was scalable.} } @InCollection{baylor:vulcan-perf-book, author = {Sandra Johnson Baylor and Caroline Benveniste and Yarsun Hsu}, title = {Performance Evaluation of a Massively Parallel {I/O} Subsystem}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {13}, editor = {Ravi Jain and John Werth and James C. Browne}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {293--311}, publisher = {Kluwer Academic Publishers}, earlier = {baylor:vulcan-perf}, keyword = {parallel I/O architecture, performance evaluation, pario-bib}, abstract = {Presented are the trace-driven simulation results of a study conducted to evaluate the performance of the internal parallel I/O subsystem of the Vulcan massively parallel processor (MPP) architecture. The system sizes evaluated vary from 16 to 512 nodes. The results show that a compute node to I/O node ratio of four is the most cost effective for all system sizes, suggesting high scalability. Also, processor-to-processor communication effects are negligible for small message sizes and the greater the fraction of I/O reads, the better the I/O performance. Worse case I/O node placement is within 13\% of more efficient placement strategies. Introducing parallelism into the internal I/O subsystem improves I/O performance significantly.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @InProceedings{baylor:workload, author = {Sandra Johnson Baylor and C. 
Eric Wu}, title = {Parallel {I/O} Workload Characteristics Using {Vesta}}, booktitle = {Proceedings of the IPPS~'95 Workshop on Input/Output in Parallel and Distributed Systems}, year = {1995}, month = {April}, pages = {16--29}, later = {baylor:workload-book}, URL = {http://www.research.ibm.com:8080/PS/155.ps.gz}, keyword = {parallel I/O, workload characterization, pario-bib}, abstract = {In recent years, the design and performance evaluation of parallel processors has focused on the processor, memory and communication subsystems. As a result, these subsystems have better performance potential than the I/O subsystem. In fact, the I/O subsystem is the bottleneck in many machines. However, there are a number of studies currently underway to improve the design of parallel I/O subsystems. To develop optimal parallel I/O subsystem designs, one must have a thorough understanding of the workload characteristics of parallel I/O and its exploitation of the associated parallel file system. Presented are the results of a study conducted to analyze the parallel I/O workloads of several applications on a parallel processor using the Vesta parallel file system. Traces of the applications are obtained to collect system events, communication events, and parallel I/O events. The traces are then analyzed to determine workload characteristics. The results show I/O request rates on the order of hundreds of requests per second, a large majority of requests are for small amount of data (less than 1500 bytes), a few requests are for large amounts of data (on the order of megabytes), significant file sharing among processes within a job, and strong temporal, traditional spatial, and interprocess spatial locality.}, comment = {See polished version baylor:workload-book. They characterize four parallel applications: sort, matrix multiply, seismic migration, and video server, in terms of their I/O activity. 
They found results that are consistent with kotz:workload, in that they also found lots of small data requests, some large data requests, significant file sharing and interprocess locality. This study found less of the non-contiguous access than did kotz:workload, because of the logical views provided by Vesta. Note on-line postscript does not include figures.} } @InCollection{baylor:workload-book, author = {Sandra Johnson Baylor and C. Eric Wu}, title = {Parallel {I/O} Workload Characteristics Using {Vesta}}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {7}, editor = {Ravi Jain and John Werth and James C. Browne}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {167--185}, publisher = {Kluwer Academic Publishers}, earlier = {baylor:workload}, keyword = {parallel I/O, file access pattern, workload characterization, file system workload, pario-bib}, abstract = {To develop optimal parallel I/O subsystems, one must have a thorough understanding of the workload characteristics of parallel I/O and its exploitation of the associated parallel file system. Presented are the results of a study conducted to analyze the parallel I/O workloads of several applications on a parallel processor using the Vesta parallel file system. Traces of the applications are obtained to collect system events, communication events, and parallel I/O events. The traces are then analyzed to determine workload characteristics. 
The results show I/O request rates on the order of hundreds of requests per second, a large majority of requests are for small amounts of data (less than 1500 bytes), a few requests are for large amounts of data (on the order of megabytes), significant file sharing among processes within a job, and strong temporal, traditional spatial, and interprocess spatial locality.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @Manual{bbn:admin, key = {BBN}, author = {{BBN Advanced Computers Inc.}}, title = {{TC2000} System Administration Guide}, edition = {Revision 3.0}, year = {1991}, month = {April}, keyword = {BBN, parallel I/O, pario-bib}, comment = {Administrative manual for the TC2000 I/O system. Can stripe over partitions in a user-specified set of disks. Large requests automatically split and done in parallel. See also garber:tc2000.} } @TechReport{becher:ooc-solver, author = {Jonathan D. Becher and John F. Porter}, title = {Out of Core Dense Solvers for the {MasPar} Parallel Computer}, year = {1994}, number = {MP/IP/SP-37.94}, institution = {MasPar Computer Corporation}, keyword = {parallel I/O, scientific computing, linear algebra, pario-bib}, comment = {They look at out-of-core block and slab solvers for the Maspar. They overlap reading one block with the computation of the previous block. They solve matrices up to 40k x 40k, and obtain 3.14 GFlops even with I/O considered.} } @InProceedings{bell:physics, author = {Jean L. Bell}, title = {A Specialized Data Management System for Parallel Execution of Particle Physics Codes}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data}, year = {1988}, pages = {277--285}, publisher = {ACM Press}, address = {Chicago, IL}, keyword = {file access pattern, disk prefetch, file system, pario-bib}, comment = {A specialized database system for particle physics codes. Valuable for its description of access patterns and subsequent file access requirements.
Particle-in-cell codes iterate over timesteps, updating the position of each particle, and then the characteristics of each cell in the grid. Particles may move from cell to cell. Particle update needs itself and nearby gridcell data. The whole dataset is too big for memory, and each timestep must be stored on disk for later analysis anyway. Regular file systems are inadequate: specialized DBMS is more appropriate. Characteristics needed by their application class: multidimensional access (by particle type or by location, i.e., multiple views of the data), coordination between grid and particle data, coordination between processors, coordinated access to meta-data, inverted files, horizontal clustering, large blocking of data, asynchronous I/O, array data, complicated joins, and prefetching according to user-prespecified order. Note that many of these things can be provided by a file system, but that most are hard to come by in typical file systems, if not impossible. Many of these features are generalizable to other applications.} } @InProceedings{benner:pargraphics, author = {Robert E. Benner}, title = {Parallel Graphics Algorithms on a 1024-Processor Hypercube}, booktitle = {Proceedings of the Fourth Conference on Hypercube Concurrent Computers and Applications}, year = {1989}, pages = {133--140}, publisher = {Golden Gate Enterprises, Los Altos, CA}, address = {Monterey, CA}, keyword = {hypercube, graphics, parallel algorithm, parallel I/O, pario-bib}, comment = {About using the nCUBE/10's RT Graphics System. They were frustrated by an unusual mapping from the graphics memory to the display, a shortage of memory on the graphics nodes, and small message buffers on the graphics nodes. They wrote some algorithms for collecting the columns of pixels from the hypercube nodes, and routing them to the appropriate graphics node. 
They also would have liked a better interconnection network between the graphics nodes, at least for synchronization.} } @InProceedings{bennett:jovian, author = {Robert Bennett and Kelvin Bryant and Alan Sussman and Raja Das and Joel Saltz}, title = {{Jovian}: A Framework for Optimizing Parallel {I/O}}, booktitle = {Proceedings of the Scalable Parallel Libraries Conference}, year = {1994}, month = {October}, pages = {10--20}, publisher = {IEEE Computer Society Press}, address = {Mississippi State, MS}, URL = {ftp://hpsl.cs.umd.edu/pub/papers/splc94.ps.Z}, keyword = {parallel I/O, pario-bib}, comment = {Jovian is a runtime library for use with SPMD codes, eg, HPF. They restrict IO to collective operations, and provide extra processes to 'coalesce' the many requests from multiple CPs into fewer larger requests to the operating system, perhaps optimized for access order. They mention that there is a standardization process underway for specifying data distributions. Also a compact representation for strided access to n-dimensional data structures. Coalescing basically means combining requests to eliminate duplication and to combine adjacent requests. Requests to coalescers are in full blocks, to lower the processing overhead. 
Nonetheless, their method involves moving requests around twice, and involve several memory-memory copies of the data, so their overhead is high.} } @Misc{berdahl:transport, author = {Lawrence Berdahl}, title = {Parallel Transport Protocol Proposal}, year = {1995}, month = {January 3,}, howpublished = {Lawrence Livermore National Labs}, note = {Draft}, earlier = {berdahl:woodenman}, URL = {ftp://ccsf.caltech.edu/pub/berdahl/Pio-1-3-95.ps}, keyword = {parallel I/O, network, supercomputer system, pario-bib}, comment = {An update of berdahl:woodenman, close to the final draft.} } @Misc{berdahl:woodenman, author = {Lawrence Berdahl}, title = {Parallel Data Exchange}, year = {1994}, month = {January 28,}, howpublished = {Lawrence Livermore National Labs}, note = {WoodenMan Proposal}, later = {berdahl:transport}, keyword = {parallel I/O, network, supercomputer system, pario-bib}, comment = {They describe a protocol for making parallel data transfers of arbitrary data sets from one set of data servers to another set of data servers. The goal is to be independent of specific architectures or even types of data servers, and to work on top of existing transport protocols. The data set is described using a gather set for the source and a scatter set for the destination, and using a linear address space as an intermediate representation. All the servers are contacted, they figure out who they need to talk to, and exchange port information with them. Each pair exchanges votes on who will control the transfer (ie, who will control the order of the transfer), and on their maximum data rates. This information is used to settle on the control and set of ports to be used. This proposal is not final and is under active development, so it may change.} } @Article{berrendorf:paragon, author = {R. Berrendorf and H. Burg and U.
Detert}, title = {Performance Characteristics of Parallel Computers: {Intel Paragon} Case Study}, journal = {{IT+TI} Informationstechnik und Technische Informatik}, year = {1995}, month = {April}, volume = {37}, number = {2}, pages = {37--45}, note = {(In German).}, keyword = {parallel computing, performance evaluation, parallel file system, pario-bib}, comment = {In German. They summarize typical performance of the Intel Paragon, including the communication performance and the parallel file-system performance.} } @TechReport{bershad:sio-os, author = {Brian Bershad and David Black and David DeWitt and Garth Gibson and Kai Li and Larry Peterson and Marc Snir}, title = {Operating System Support for High-Performance Parallel {I/O} Systems}, year = {1994}, number = {CCSF-40}, institution = {Scalable I/O Initiative}, address = {Caltech Concurrent Supercomputing Facilities, Caltech}, URL = {http://www.ccsf.caltech.edu/SIO/SIO_osfs.ps}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, comment = {Four major components: networking, memory servers, file system, and persistent object store. Networking part focuses on low-latency support communication within an application, between applications, and between machines (Bershad and Peterson). Memory servers, shared virtual memory, and checkpointing support (Kai Li). File systems support includes benchmarking, transparent informed prefetching (Gibson), a common interface for PFS and Vesta (Snir), and integrating secondary and tertiary storage systems (including the integration of the National Storage Lab's HPSS (see coyne:hpss) into this project in 1995). OSF/1 (Black) will be extended to support parallel file systems, extent-like behavior, and block coalescing. Persistent object store (DeWitt) is radical change to an object-oriented interface, transparent I/O (though extensible and changeable with subclassing, presumably), and heterogeneous support via the Object Definition Language standard.
Persistent objects may be integrated with the memory servers and shared virtual memory. See also poole:sio-survey, bagrodia:sio-character, choudhary:sio-language.} } @InProceedings{berson:multimedia, author = {Steven Berson and Leana Golubchik and Richard R. Muntz}, title = {Fault Tolerant Design of Multimedia Servers}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data}, year = {1995}, pages = {364--375}, publisher = {ACM Press}, keyword = {fault tolerance, multimedia, video on demand, parallel I/O, pario-bib} } @InProceedings{best:cmmdio, author = {Michael L. Best and Adam Greenberg and Craig Stanfill and Lewis W. Tucker}, title = {{CMMD I/O}: A Parallel {Unix I/O}}, booktitle = {Proceedings of the Seventh International Parallel Processing Symposium}, year = {1993}, pages = {489--495}, publisher = {IEEE Computer Society Press}, address = {Newport Beach, CA}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, comment = {Much like Intel CFS, with different I/O modes that determine when the compute nodes synchronize, and the semantics of I/Os written to the file. They found it hard to get good bandwidth for independent I/Os, as opposed to coordinated I/Os; part of this was due to their RAID~3 disk array, but it is more complicated than that. Some performance numbers were given in talk.} } @InProceedings{bestavros:raid, author = {Azer Bestavros}, title = {{IDA}-Based Redundant Arrays of Inexpensive Disks}, booktitle = {Proceedings of the First International Conference on Parallel and Distributed Information Systems}, year = {1991}, month = {December}, pages = {2--9}, keyword = {RAID, disk array, reliability, parallel I/O, pario-bib}, comment = {Uses the Information Dispersal Algorithm (IDA) to generate $n+m$ blocks from $n$ blocks, to tolerate $m$ disk failures; all of the data from the $n$ blocks is hidden in the $n+m$ blocks. 
Not with the RAID project.} } @InProceedings{bitton:schedule, author = {Dina Bitton}, title = {Arm Scheduling in Shadowed Disks}, booktitle = {Proceedings of IEEE Compcon}, year = {1989}, month = {Spring}, pages = {132--136}, keyword = {parallel I/O, disk shadowing, reliability, disk mirroring, disk optimization, pario-bib}, comment = {Goes further than bitton:shadow. Uses simulation to verify results from that paper, which were expressions for the expected seek distance of shadowed disks, using shortest-seek-time arm scheduling. Problem is her assumption that arm positions stay independent, in the face of correlating effects like writes, which move all arms to the same place. Simulations match model only barely, and only in some cases. Anyway, shadowed disks can improve performance for workloads more than 60 or 70\% reads.} } @InProceedings{bitton:shadow, author = {D. Bitton and J. Gray}, title = {Disk Shadowing}, booktitle = {Proceedings of the 14th International Conference on Very Large Data Bases}, year = {1988}, pages = {331--338}, keyword = {parallel I/O, disk shadowing, reliability, disk mirroring, disk optimization, pario-bib}, comment = {Also TR UIC EECS 88-1 from Univ of Illinois at Chicago. Shadowed disks are mirroring with more than 2 disks. Writes to all disks, reads from one with shortest seek time. Acknowledges but ignores problem posed by lo:disks. Also considers that newer disk technology does not have linear seek time $(a+bx)$ but rather $(a+b\sqrt{x})$. Shows that with either seek distribution the average seek time for workloads with at least 60\% reads decreases in the number of disks. See also bitton:schedule.} } @InProceedings{bjorstad:structure, author = {P. E. Bj{\o}rstad and J. 
Cook}, title = {Large Scale Structural Analysis On Massively Parallel Computers}, booktitle = {Linear Algebra for Large Scale and Real-Time Applications}, year = {1993}, pages = {3--11}, publisher = {Kluwer Academic Publishers}, note = {ftp from ftp.ii.uib.no in \verb+pub/tech_reports/mpp_sestra.ps.Z+.}, URL = {file://ftp.ii.uib.no/pub/tech_reports/mpp_sestra.ps.Z}, keyword = {parallel I/O, file access pattern, pario-bib}, comment = {A substantial part of this structural-analysis application was involved in I/O, moving substructures in and out of RAM. The Maspar IO-RAM helped a lot, nearly halving the time required. On the Cray, the SSD had an even bigger impact, perhaps 7--12 times faster. Their main conclusion is that caching helped. Most likely this was due to its double-buffering, since they structured the code to read/compute/write in large ``superblocks''.} } @Article{boral:bubba, author = {Haran Boral and William Alexander and Larry Clay and George Copeland and Scott Danforth and Michael Franklin and Brian Hart and Marc Smith and Patrick Valduriez}, title = {Prototyping {Bubba}, a Highly Parallel Database System}, journal = {IEEE Transactions on Knowledge and Data Engineering}, year = {1990}, month = {March}, volume = {2}, number = {1}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, database, disk caching, pario-bib}, comment = {More recent than copeland:bubba, and a little more general. This gives few details, and doesn't spend much time on the parallel I/O. Bubba does use parallel independent disks, with a significant effort to place data on the disks, and do the work local to the disks, to balance the load and minimize interprocessor communication. Also they use a single-level store (i.e., memory-mapped files) to improve performance of their I/O system, including page locking that is assisted by the MMU. The OS has hooks for the database manager to give memory-management policy hints.} } @InProceedings{boral:critique, author = {H. 
Boral and D. {DeWitt}}, title = {Database machines: an idea whose time has passed?}, booktitle = {Proceedings of the Second International Workshop on Database Machines}, year = {1983}, pages = {166--187}, publisher = {Springer-Verlag}, keyword = {file access pattern, parallel I/O, database machine, pario-bib}, comment = {Improvements in I/O bandwidth crucial for supporting database machines, otherwise highly parallel DB machines are useless (I/O bound). Two ways to do it: 1) synchronized interleaving by using custom controller and regular disks to read/write same track on all disks, which speeds individual accesses. 2) use very large cache (100-200M) to keep blocks to re-use and to do prefetching. But see dewitt:pardbs.} } @TechReport{bordawekar:collective, author = {Rajesh Bordawekar}, title = {Implementation and Evaluation of Collective {I/O} in the {Intel Paragon Parallel File System}}, year = {1996}, month = {November}, number = {CACR~TR-128}, institution = {Center of Advanced Computing Research, California Institute of Technology}, URL = {http://www.cacr.caltech.edu/~rajesh/collective.html}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, abstract = {A majority of parallel applications obtain parallelism by partitioning data over multiple processors. Accessing distributed data structures like arrays from files often requires each processor to make a large number of small non-contiguous data requests. This problem can be addressed by replacing small non-contiguous requests by large collective requests. This approach, known as Collective I/O, has been found to work extremely well in practice. In this paper, we describe implementation and evaluation of a collective I/O prototype in a production parallel file system on the Intel Paragon. The prototype is implemented in the PFS subsystem of the Intel Paragon Operating System. We evaluate the collective I/O performance using its comparison with the PFS M_RECORD and M_UNIX I/O modes.
It is observed that collective I/O provides significant performance improvement over accesses in M_UNIX mode. However, in many cases, various implementation overheads cause collective I/O to provide lower performance than the M_RECORD I/O mode.} } @InProceedings{bordawekar:comm, author = {Rajesh Bordawekar and Alok Choudhary}, title = {Communication Strategies for Out-of-core Programs on Distributed Memory Machines}, booktitle = {Proceedings of the 9th ACM International Conference on Supercomputing}, year = {1995}, month = {July}, pages = {395--403}, publisher = {ACM Press}, address = {Barcelona}, earlier = {bordawekar:comm-tr}, keyword = {parallel I/O, inter-processor communication, pario-bib}, comment = {bordawekar:comm-tr is nearly identical in content. Also bordawekar:commstrat is a shorter version.} } @TechReport{bordawekar:comm-tr, author = {Rajesh Bordawekar and Alok Choudhary}, title = {Communication Strategies for Out-of-core Programs on Distributed Memory Machines}, year = {1994}, number = {SCCS-667}, institution = {NPAC, Syracuse University}, later = {bordawekar:comm}, URL = {http://www.npac.syr.edu/pub/by_index/sccs/papers/ps/0660/sccs-0667.ps.Z}, keyword = {parallel I/O, inter-processor communication, pario-bib}, abstract = {In this paper, we show that communication in the out-of-core distributed memory problems requires both inter-processor communication and file I/O. Given that primary data structures reside in files, even communication requires I/O. Thus, it is important to optimize the I/O costs associated with a communication step. We present three methods for performing communication in out-of-core distributed memory problems. The first method, termed as the "out-of-core" communication method, follows a loosely synchronous model. Computation and Communication phases in this case are clearly separated, and communication requires permutation of data in files. 
The second method, termed as "demand-driven-in-core communication" considers only communication required of each in-core data slab individually. The third method, termed as "producer-driven-in-core communication" goes even one step further and tries to identify the potential (future) use of data while it is in memory. We describe these methods in detail and provide performance results for out-of-core applications; namely, two-dimensional FFT and two-dimensional elliptic solver. Finally, we discuss how "out-of-core" and "in-core" communication methods could be used in virtual memory environments on distributed memory machines.}, comment = {They compare different ways to do global communications in out-of-core applications, involving file I/O and communication at different times. They also comment briefly on how it would work if it depended on virtual memory at each node.} } @InProceedings{bordawekar:commstrat, author = {Rajesh Bordawekar and Alok Choudhary}, title = {Communication strategies for out-of-core programs on distributed memory machines}, booktitle = {Proceedings of the 1995 International Conference on High Performance Computing}, year = {1995}, month = {December}, pages = {130--135}, address = {New Delhi, India}, earlier = {bordawekar:comm}, keyword = {interprocessor communication, parallel I/O, pario-bib}, comment = {Small version of bordawekar:comm.} } @Article{bordawekar:compcomm, author = {Rajesh Bordawekar and Alok Choudhary and J. 
Ramanujam}, title = {Compilation and Communication Strategies for Out-of-core programs on Distributed-Memory Machines}, journal = {Journal of Parallel and Distributed Computing}, year = {1996}, month = {November}, volume = {38}, number = {2}, pages = {277--288}, publisher = {Academic Press}, earlier = {bordawekar:compcomm-tr}, keyword = {compiler, communication, out-of-core, parallel I/O, inter-processor communication, pario-bib}, abstract = {It is widely acknowledged that improving parallel I/O performance is critical for widespread adoption of high performance computing. In this paper, we show that communication in out-of-core distributed memory problems may require both inter-processor communication and file I/O. Thus, in order to improve I/O performance, it is necessary to minimize the I/O costs associated with a communication step. We present three methods for performing communication in out-of-core distributed memory problems. The first method called the generalized collective communication method follows a loosely synchronous model; computation and communication phases are clearly separated, and communication requires permutation of data in files. The second method called the receiver-driven in-core communication considers only communication required of each in-core data slab individually. The third method called the owner-driven in-core communication goes even one step further and tries to identify the potential future use of data (by the recipients) while it is in the sender's memory. We describe these methods in detail and present a simple heuristic to choose a communication method from among the three methods. We then provide performance results for two out-of-core applications, the two-dimensional FFT code and the two-dimensional elliptic Jacobi solver. 
Finally, we discuss how the out-of-core and in-core communication methods can be used in virtual memory environments on distributed memory machines.} } @TechReport{bordawekar:compcomm-tr, author = {Rajesh Bordawekar and Alok Choudhary and J. Ramanujam}, title = {Compilation and Communication Strategies for Out-of-core programs on Distributed Memory Machines}, year = {1995}, month = {November}, number = {CACR-113}, institution = {Scalable I/O Initiative, Center of Advanced Computing Research, California Institute of Technology}, later = {bordawekar:compcomm}, URL = {http://www.cat.syr.edu/~rajesh/cacr113.ps}, abstract = {It is widely acknowledged that improving parallel I/O performance is critical for widespread adoption of high performance computing. In this paper, we show that communication in out-of-core distributed memory problems may require both inter-processor communication and file I/O. Thus, in order to improve I/O performance, it is necessary to minimize the I/O costs associated with a communication step. We present three methods for performing communication in out-of-core distributed memory problems. The first method called the generalized collective communication method follows a loosely synchronous model; computation and communication phases are clearly separated, and communication requires permutation of data in files. The second method called the receiver-driven in-core communication considers only communication required of each in-core data slab individually. The third method called the owner-driven in-core communication goes even one step further and tries to identify the potential future use of data (by the recipients) while it is in the sender's memory. We describe these methods in detail and present a simple heuristic to choose a communication method from among the three methods. We then provide performance results for two out-of-core applications, the two-dimensional FFT code and the two-dimensional elliptic Jacobi solver. 
Finally, we discuss how the out-of-core and in-core communication methods can be used in virtual memory environments on distributed memory machines.}, comment = {See also bordawekar:comm, at ICS'95.} } @InCollection{bordawekar:compiling, author = {Rajesh Bordawekar and Alok Choudhary}, title = {Issues in Compiling {I/O} Intensive Problems}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {3}, editor = {Ravi Jain and John Werth and James C. Browne}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {69--96}, publisher = {Kluwer Academic Publishers}, keyword = {parallel I/O, compiler, out-of-core, pario-bib}, abstract = {None.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @InProceedings{bordawekar:delta-fs, author = {Rajesh Bordawekar and Alok Choudhary and Juan Miguel Del Rosario}, title = {An Experimental Performance Evaluation of {Touchstone Delta Concurrent File System}}, booktitle = {Proceedings of the 7th ACM International Conference on Supercomputing}, year = {1993}, pages = {367--376}, publisher = {ACM Press}, earlier = {bordawekar:delta-fs-TR}, URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/ics93.ps.Z}, keyword = {performance evaluation, multiprocessor file system, parallel I/O, pario-bib}, abstract = {For a high-performance parallel machine to be a scalable system, it must also have a scalable parallel I/O system. Recently, several commercial machines (e.g. Intel Touchstone Delta, Paragon, CM-5, Ncube-2) have been built that provide features for parallel I/O. However, very little is understood about the performance of these I/O systems. This paper presents an experimental evaluation of the Intel Touchstone Delta's Concurrent File System (CFS). The CFS utilizes the declustering of large files across the disks to improve the I/O performance. Data files can be read or written on the CFS using 4 access modes. 
\par We present performance measurements for the CFS on the Touchstone Delta with 512 compute nodes and 32 I/O nodes. The study focuses on file read/write rates for various configurations of I/O and compute nodes. The study attempts to show the effect of access modes, buffer sizes and volume restrictions on the system performance. The paper also shows that the performance of the CFS can greatly vary for various data distributions commonly employed in scientific and engineering applications.}, comment = {Some new numbers over bordawekar:delta-fs-TR, but basically the same conclusions.} } @TechReport{bordawekar:delta-fs-TR, author = {Rajesh Bordawekar and Alok Choudhary and Juan Miguel Del Rosario}, title = {An Experimental Performance Evaluation of {Touchstone Delta Concurrent File System}}, year = {1992}, number = {SCCS-420}, institution = {NPAC, Syracuse University}, later = {bordawekar:delta-fs}, keyword = {performance evaluation, multiprocessor file system, parallel I/O, pario-bib}, comment = {Evaluating the Caltech Touchstone Delta (512 nodes, 32 I/O nodes, 64 disks, 8 MB cache per I/O node). Basic measurements of different access patterns and I/O modes. Location in network doesn't seem to matter. Throughput is often limited by the software; at least, the full hardware throughputs are rarely obtained. Sometimes they are compnode-limited, and other times they may be being limited by the cache management. 
There must be a way to push bottleneck back to the disks.} } @TechReport{bordawekar:efficient, author = {Rajesh Bordawekar and Rajeev Thakur and Alok Choudhary}, title = {Efficient Compilation of Out-of-core Data Parallel Programs}, year = {1994}, month = {April}, number = {SCCS-622}, institution = {NPAC}, later = {bordawekar:reorganize}, URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/access_reorg.ps.Z}, keyword = {parallel I/O, compiler, pario-bib}, abstract = {Large scale scientific applications, such as the Grand Challenge applications, deal with very large quantities of data. The amount of main memory in distributed memory machines is usually not large enough to solve problems of realistic size. This limitation results in the need for system and application software support to provide efficient parallel I/O for out-of-core programs. This paper describes techniques for translating out-of-core programs written in a data parallel language like HPF to message passing node programs with explicit parallel I/O. We describe the basic compilation model and various steps involved in the compilation. The compilation process is explained with the help of an out-of-core matrix multiplication program. We first discuss how an out-of-core program can be translated by extending the method used for translating in-core programs. We then describe how the compiler can optimize the code by estimating the I/O costs associated with different array access patterns and selecting the method with the least I/O cost. This optimization can reduce the amount of I/O by as much as an order of magnitude. Performance results on the Intel Touchstone Delta are presented and analyzed.}, comment = {Revised as bordawekar:reorganize. This is actually fairly different from thakur:runtime. They describe the same basic compiler technique, where arrays are distributed across processors, and each processor has a local array file for holding data from its local partitions. 
Then the I/O needed for a loop is broken into slabs, where the program proceeds as an alternation of (read slabs, compute, write slabs). The big new thing here is that the compiler tries different ways to form slabs (e.g., by row or by column), estimates the number of I/Os and the amount of data moved for each case, and chooses the case with the smallest amount of I/O. They also mention how the choice of memory size allocated to different arrays affects the amount of IO, but give no algorithm other than "try all the possibilities."} } @TechReport{bordawekar:framework, author = {Rajesh Bordawekar and Alok Choudhary}, title = {A Framework for Representing Data Parallel Programs and its Application in Program Reordering}, year = {1995}, month = {March}, number = {SCCS-698}, institution = {NPAC, Syracuse University}, URL = {http://www.npac.syr.edu/techreports/html/0650/abs-0698.html}, keyword = {data parallel, parallel I/O, pario-bib}, comment = {Although this is mostly a compilers paper, there is a little bit about parallel I/O here. 
They comment briefly on how their compiler framework will help them make a compiler that can provide advice to the file system about prefetching and cache replacement, and to decide on the layout of scratch files to optimize locality.} } @TechReport{bordawekar:hpf, author = {Rajesh Bordawekar and Alok Choudhary}, title = {{HPF} with Parallel {I/O} Extensions}, year = {1993}, number = {SCCS-613}, institution = {NPAC, Syracuse University}, URL = {http://www.npac.syr.edu/techreports/ps/0600/sccs-0613.ps.Z}, keyword = {parallel I/O, pario-bib}, comment = {They propose some extensions to HPF to accommodate parallel I/O.} } @TechReport{bordawekar:hpfio, author = {Rajesh Bordawekar and Alok Choudhary}, title = {Extending {I/O} Capabilities of {High Performance Fortran}: Initial Experiences}, year = {1995}, month = {December}, number = {CACR-115}, institution = {Scalable I/O Initiative, Center of Advanced Computing Research, California Institute of Technology}, keyword = {parallel I/O, compiler, FORTRAN, HPF, pario-bib}, abstract = {This report presents implementation details of the prototype PASSION compiler. The PASSION compiler provides support for: (1) Accessing multidimensional in-core arrays and (2) Out-of-core computations. The PASSION compiler takes as input an annotated I/O intensive (either an out-of-core program or program accessing distributed arrays from files) High Performance Fortran (HPF) program. Using hints provided by the user, the compiler modifies the computation so as to minimize the I/O cost and restructures the program to incorporate explicit I/O calls. In this report, compilation of out-of-core FORALL constructs is illustrated using representative programs. Compiler support for accessing distributed in-core data is explained using illustrative examples and supplemented by experimental results.}, comment = {Currently not available on WWW. 
Describes implementation details of the PASSION Compiler.} } @InProceedings{bordawekar:model, author = {Rajesh Bordawekar and Alok Choudhary and Ken Kennedy and Charles Koelbel and Michael Paleczny}, title = {A Model and Compilation Strategy for Out-of-core Data Parallel Programs}, booktitle = {Proceedings of the Fifth ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, year = {1995}, month = {July}, pages = {1--10}, publisher = {ACM Press}, address = {Santa Barbara, CA}, note = {Also available as the following technical reports: NPAC Technical Report SCCS-0696, CRPC Technical Report CRPC-TR94507-S, SIO Technical Report CACR SIO-104}, earlier = {bordawekar:model-tr}, URL = {http://www.cacr.caltech.edu/techpubs/PAPERS/cacr104.ps}, keyword = {parallel I/O, compiler, pario-bib}, abstract = {It is widely acknowledged in high-performance computing circles that parallel input/output needs substantial improvement in order to make scalable computers truly usable. We present a data storage model that allows processors independent access to their own data and a corresponding compilation strategy that integrates data-parallel computation with data distribution for out-of-core problems. Our results compare several communication methods and I/O optimizations using two out-of-core problems, Jacobi iteration and LU factorization.} } @TechReport{bordawekar:model-tr, author = {Rajesh Bordawekar and Alok Choudhary and Ken Kennedy and Charles Koelbel and Mike Paleczny}, title = {A Model and Compilation Strategy for Out-of-Core Data Parallel Programs}, year = {1994}, month = {December}, number = {CRPC-TR94507-S}, institution = {CRPC}, later = {bordawekar:model}, URL = {gopher://softlib.rice.edu/99/softlib/CRPC-TRs/reports/CRPC-TR94507-S.ps}, keyword = {compilers, parallel I/O, out-of-core applications, pario-bib}, comment = {Basically a summary of their I/O and compilation model for out-of-core compilation of HPF programs. 
See also paleczny:support.} } @MastersThesis{bordawekar:msthesis, author = {Rajesh R. Bordawekar}, title = {Issues in Software Support for Parallel {I/O}}, year = {1993}, month = {May}, school = {Syracuse University}, URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/msthesis.ps.Z}, keyword = {parallel I/O, pario-bib}, abstract = {This thesis looks at various issues in providing application-level software support for parallel I/O. We show that the performance of the parallel I/O system varies greatly as a function of data distributions. We present runtime I/O primitives for parallel languages which allow the user to obtain a consistent performance over a wide range of data distributions. \par In order to design these primitives, we study various parameters used in the design of a parallel file system. We evaluate the performance of Touchstone Delta Concurrent File System and study the effect of parameters like number of processors, number of disks, file size on the system performance. We compute the I/O costs for common data distributions. We propose an alternative strategy -two phase data access strategy- to optimize the I/O costs connected with data distributions. We implement runtime primitives using the two-phase access strategy and show that using these primitives not only I/O access rates are improved but also user can obtain complex data distributions like block-block and block-cyclic.}, comment = {This is basically a consolidation of the other bordawekar papers, in more detail. So he covers an experimental analysis of the touchstone delta; of the problems arising from the direct-access model for non-conforming distributions; of the two-phase model; and of the run-time library to support two-phase access. See also bordawekar:reorganize, thakur:runtime, bordawekar:efficient, thakur:out-of-core, delrosario:two-phase, bordawekar:primitives, bordawekar:delta-fs.} } @InProceedings{bordawekar:placement, author = {Rajesh Bordawekar and Alok Choudhary and J. 
Ramanujam}, title = {A Framework for Integrated Communication and {I/O} Placement}, booktitle = {Proceedings of the 2nd International Euro-Par'96, Parallel Processing}, year = {1996}, month = {August}, series = {Lecture Notes in Computer Science}, volume = {1124}, pages = {541--552}, publisher = {Springer-Verlag}, earlier = {bordawekar:placement-tr}, URL = {http://www.cacr.caltech.edu/~rajesh/europar-rajesh.ps}, keyword = {parallel I/O, compiler, pario-bib}, abstract = {This paper describes a framework for analyzing dataflow within an out-of-core parallel program. Dataflow properties of FORALL statement are analyzed and a unified I/O and communication placement framework is presented. This placement framework can be applied to many problems, which include eliminating redundant I/O incurred in communication. The framework is validated by applying it for optimizing I/O and communication in out-of-core stencil problems. Experimental performance results on an Intel Paragon show significant reduction in I/O and communication overhead.} } @TechReport{bordawekar:placement-tr, author = {Rajesh Bordawekar and Alok Choudhary and J. Ramanujam}, title = {A Framework for Integrated Communication and {I/O} Placement}, year = {1996}, month = {February}, number = {CACR-117}, institution = {Scalable I/O Initiative, Center of Advanced Computing Research, California Institute of Technology}, later = {bordawekar:placement}, URL = {http://www.cacr.caltech.edu/~rajesh/cacr117.ps}, keyword = {parallel I/O, compiler, pario-bib}, abstract = {In this paper, we describe a framework for optimizing communication and I/O costs in out-of-core problems. We focus on communication and I/O optimization within a FORALL construct. We show that existing frameworks do not extend directly to out-of-core problems and can not exploit the FORALL semantics. We present a unified framework for the placement of I/O and communication calls and apply it for optimizing communication for stencil applications. 
Using the experimental results, we demonstrate that correct placement of I/O and communication calls can completely eliminate extra file I/O from communication and obtain significant performance improvement.} } @InProceedings{bordawekar:primitives, author = {Rajesh Bordawekar and Juan Miguel {del Rosario} and Alok Choudhary}, title = {Design and Evaluation of Primitives for Parallel {I/O}}, booktitle = {Proceedings of Supercomputing '93}, year = {1993}, pages = {452--461}, publisher = {IEEE Computer Society Press}, address = {Portland, OR}, URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/sc93.ps.Z}, keyword = {parallel I/O, pario-bib}, abstract = {In this paper, we show that the performance of parallel file systems can vary greatly as a function of the selected data distributions, and that some data distributions can not be supported. Also, we describe how the parallel language extensions, though simplifying the programming, do not address the performance problems found in parallel file systems. \par We have devised an alternative scheme for conducting parallel I/O - the Two-Phase Access Strategy - which guarantees higher and more consistent performance over a wider spectrum of data distributions. We have designed and implemented runtime primitives that make use of the two-phase access strategy to conduct parallel I/O, and facilitate the programming of parallel I/O operations. We describe these primitives in detail and provide performance results which show that I/O access rates are improved by up to several orders of magnitude. Further, we show that the variation in performance over various data distributions is restricted to within a factor of 2 of the best access rate.}, comment = {Much of this is the same as delrosario:two-phase, except for section~4 where they describe their actual run-time library of primitives, with a little bit about how it works. It's not clear, for example, how their meta-data structures are distributed across the machine. 
They also do not describe their methods for the data redistribution.} } @TechReport{bordawekar:reorganize, author = {Rajesh Bordawekar and Alok Choudhary and Rajeev Thakur}, title = {Data Access Reorganizations in Compiling Out-of-core Data Parallel Programs on Distributed Memory Machines}, year = {1994}, month = {September}, number = {SCCS-622}, institution = {NPAC}, address = {Syracuse, NY 13244}, earlier = {bordawekar:efficient}, URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/access_reorg.ps.Z}, keyword = {parallel I/O, compilation, pario-bib}, comment = {Basically they give a case study of out-of-core matrix multiplication to emphasize that the compiler's choice of loop ordering and matrix distribution for in-core matmult is not a very good choice for out-of-core matmult, because it causes too much I/O. By reorganizing the data and the loops, they get much better performance. In this particular case there are known algorithms which they should have used. In general they make the point that the compiler should consider several organizations, and estimate their costs, before generating code. They don't propose anything more sophisticated than to try all the possible organizations.} } @InProceedings{bordawekar:stencil, author = {Rajesh Bordawekar and Alok Choudhary and J. Ramanujam}, title = {Automatic Optimization of Communication in Compiling Out-of-core Stencil Codes}, booktitle = {Proceedings of the 10th ACM International Conference on Supercomputing}, year = {1996}, month = {May}, pages = {366--373}, publisher = {ACM Press}, address = {Philadelphia, PA}, earlier = {bordawekar:stencil-tr}, URL = {http://www.cat.syr.edu/~rajesh/ics96.ps}, keyword = {compiler, parallel I/O, pario-bib}, abstract = {In this paper, we describe a technique for optimizing communication for out-of-core distributed memory stencil problems. In these problems, communication may require both inter-processor communication and file I/O. 
We show that in certain cases, extra file I/O incurred in communication can be completely eliminated by reordering in-core computations. The in-core computation pattern is decided by: (1) how the out-of-core data distributed into in-core slabs (tiling) and (2) how the slabs are accessed. We show that a compiler using the stencil and processor information can choose the tiling parameters and schedule the tile accesses so that the extra file I/O is eliminated and overall performance is improved.} } @TechReport{bordawekar:stencil-tr, author = {Rajesh Bordawekar and Alok Choudhary and J. Ramanujam}, title = {Automatic Optimization of Communication in Out-of-core Stencil Codes}, year = {1995}, month = {November}, number = {CACR-114}, institution = {Scalable I/O Initiative, Center of Advanced Computing Research, California Institute of Technology}, later = {bordawekar:stencil}, keyword = {compiler, parallel I/O, pario-bib}, abstract = {In this paper, we describe a technique for optimizing communication for out-of-core distributed memory stencil problems. In these problems, communication may require both inter-processor communication and file I/O. We show that in certain cases, extra file I/O incurred in communication can be completely eliminated by reordering in-core computations. The in-core computation pattern is decided by: (1) how the out-of-core data distributed into in-core slabs (tiling) and (2) how the slabs are accessed. 
We show that a compiler using the stencil and processor information can choose the tiling parameters and schedule the tile accesses so that the extra file I/O is eliminated and overall performance is improved.} } @InProceedings{bordawekar:support, author = {Rajesh Bordawekar and Alok Choudhary}, title = {Compiler and Runtime Support For Parallel {I/O}}, booktitle = {Proceedings of IFIP Working Conference (WG10.3) on Programming Environments for Massively Parallel Distributed Systems}, year = {1994}, month = {April}, publisher = {Birkhaeuser Verlag AG, Basel, Switzerland}, address = {Monte Verita, Ascona, Switzerland}, keyword = {parallel I/O, pario-bib}, comment = {Contains much of the material from bordawekar:hpf.} } @PhdThesis{bordawekar:thesis, author = {Rajesh Bordawekar}, title = {Techniques for Compiling {I/O} Intensive Parallel Programs}, year = {1996}, month = {April}, school = {Electrical and Computer Engineering Dept., Syracuse University}, note = {Also available as Caltech technical report CACR-118}, URL = {http://www.cat.syr.edu/~rajesh/thesis.html}, keyword = {parallel I/O, compiler, HPF, pario-bib}, abstract = {This dissertation investigates several issues in providing compiler support for I/O intensive parallel programs. In this dissertation, we focus on satisfying two I/O requirements, namely, support for accessing multidimensional arrays and support for {\it out-of-core} computations. We analyze working spaces in I/O intensive programs and propose three execution models to be used by users or compilers for developing efficient I/O intensive parallel programs. Different phases in compiling out-of-core parallel programs are then described. Three different methods for performing communication are presented and validated using representative application templates. We illustrate that communication in out-of-core programs may require both inter-processor communication and file I/O. 
We show that using the {\it copy-in-copy-out} semantics of the HPF {\tt FORALL} construct, extra file I/O incurred in communication can be completely eliminated by reordering in-core computations. Two different approaches for reordering in-core computations are presented, namely, integrated tiling and scheduling heuristic, and dataflow framework for placing communication and I/O calls. The discussion is supplemented with experimental performance results of representative stencil applications. Finally, an overview of the prototype \textsf{PASSION} (Parallel And Scalable Software for I/O) compiler is presented. This compiler takes an annotated out-of-core High Performance Fortran (HPF) program as input and generates the corresponding {\it node+message-passing} program with calls to the parallel I/O runtime library. We illustrate various functionalities of the compiler using example programs and supplement them by experimental results.} } @InProceedings{bornstein:reshuffle, author = {C. Bornstein and P. Steenkiste}, title = {Data Reshuffling in Support of Fast {I/O} For Distributed-Memory Machines}, booktitle = {Proceedings of the Third IEEE International Symposium on High Performance Distributed Computing}, year = {1994}, month = {August}, pages = {227--235}, keyword = {parallel I/O, distributed memory, pario-bib}, comment = {In a sense, this is about a two-phase technique for network I/O. They consider the problem of feeding a fast network interface (HIPPI) from a distributed-memory parallel machine (iWARP) in which the individual internal links are slower than the external network. So they get the processors to cooperate to reshuffle the data into a canonical layout that is convenient to send to the gateway node, and from there onto the external network.} } @InProceedings{bradley:ipsc2io, author = {David K. Bradley and Daniel A. 
Reed}, title = {Performance of the {Intel iPSC/2} Input/Output System}, booktitle = {Proceedings of the Fourth Conference on Hypercube Concurrent Computers and Applications}, year = {1989}, pages = {141--144}, publisher = {Golden Gate Enterprises, Los Altos, CA}, address = {Monterey, CA}, keyword = {hypercube, parallel I/O, Intel, pario-bib}, comment = {Some measurements and simulations of early CFS performance. Looks terrible, but they disclaim that it is a beta version of the first CFS. They determined that the disks are the bottleneck. But this may just imply that they need more disks. Their parallel synthetic applications had each process read a separate file. CFS had ridiculous traffic overhead. Again, this was beta CFS.} } @TechReport{brandwijn:dasd, author = {Alexandre Brandwajn}, title = {Performance Benefits of Parallelism in Cached {DASD} Controllers}, year = {1988}, month = {November}, number = {UCSC-CRL-88-30}, institution = {Computer Research Laboratory, UC Santa Cruz}, keyword = {parallel I/O, disk caching, disk architecture, pario-bib}, comment = {Some new DASD products with caches overlap cache hits with prefetch of remainder of track into cache. They use analytical model to evaluate performance of these. They find performance improvements of 5-15 percent under their assumptions.} } @InProceedings{brezany:HPF, author = {Peter Brezany and Michael Gerndt and Piyush Mehrotra and Hans Zima}, title = {Concurrent File Operations in a {High Performance FORTRAN}}, booktitle = {Proceedings of Supercomputing '92}, year = {1992}, pages = {230--237}, keyword = {supercomputing, fortran, multiprocessor file system interface, pario-bib}, comment = {Describing their way of writing arrays to files so that they are written in a fast, parallel way, and so that (if read in same distribution) they can be read fast and parallel. 
Normal read and write forces standard ordering, but cread and cwrite uses a compiler and runtime selected ordering, which is stored in the file so it can be used when rereading. Good for temp files.} } @InProceedings{brezany:architecture, author = {Peter Brezany and Thomas A. Mueck and Erich Schikuta}, title = {A Software Architecture for Massively Parallel Input-Output}, booktitle = {Third International Workshop PARA'96 (Applied Parallel Computing - Industrial Computation and Optimization)}, year = {1996}, month = {August}, series = {Lecture Notes in Computer Science}, volume = {1186}, pages = {85--96}, publisher = {Springer-Verlag}, address = {Lyngby, Denmark}, note = {Also available as Technical Report of the Inst. f.~Angewandte Informatik u. Informationssysteme, University of Vienna, TR~96202}, URL = {http://www.pri.univie.ac.at/~schiki/research/paper/para96/para96.ps}, keyword = {compiler transformations, runtime support, parallel I/O, prefetching, pario-bib}, abstract = {For an increasing number of data intensive scientific applications, parallel I/O concepts are a major performance issue. Tackling this issue, we provide an outline of an input/output system designed for highly efficient, scalable and conveniently usable parallel I/O on distributed memory systems. The main focus of this paper is the parallel I/O runtime system support provided for software-generated programs produced by parallelizing compilers in the context of High Performance FORTRAN efforts. Specifically, our design is presented in the context of the Vienna Fortran Compilation System.} } @InProceedings{brezany:compiling, author = {Peter Brezany and Thomas A. 
Mueck and Erich Schikuta}, title = {Mass Storage Support for a Parallelizing Compilation System}, booktitle = {International Conference Eurosim'96-- HPCN challenges in Telecomp and Telecom: Parallel Simulation of Complex Systems and Large Scale Applications}, year = {1996}, month = {June}, pages = {63--70}, publisher = {North-Holland, Elsevier Science}, address = {Delft, The Netherlands}, URL = {http://www.pri.univie.ac.at/~schiki/research/paper/eurosim96/eurosim96.ps}, keyword = {parallel I/O, high performance mass storage system, high performance languages, compilation techniques, data administration, pario-bib} } @InProceedings{brezany:io-support, author = {Peter Brezany and Thomas A. Mueck and Erich Schikuta}, title = {Language, Compiler and Parallel Database Support for {I/O} Intensive Applications}, booktitle = {Proceedings of the International Conference on High Performance Computing and Networking}, year = {1995}, month = {May}, series = {Lecture Notes in Computer Science}, volume = {919}, pages = {14--20}, publisher = {Springer-Verlag}, address = {Milan, Italy}, note = {also available as Technical Report of the Inst. f.~Software Technology and Parallel Systems, University of Vienna, TR95-8, 1995}, URL = {http://www.pri.univie.ac.at/~schiki/research/paper/techrep/tr95-8.ps}, keyword = {compiler transformations, runtime support, declustering, parallel I/O, pario-bib}, comment = {They describe some extensions to Vienna Fortran that support parallel I/O, and how they plan to extend the compiler and run-time system to help. They are somewhat short on details, however. The basic idea is that file declustering is based on hints from the compiler or programmer about how the file will be used, eg, as a matrix distributed in thus-and-so way.} } @TechReport{brezany:irregular-tr, author = {P. Brezany and A. 
Choudhary}, title = {Techniques and Optimizations for Developing Irregular Out-of-Core Applications on Distributed-Memory Systems}, year = {1996}, month = {November}, number = {96-4}, institution = {Institute for Software Technology and Parallel Systems, University of Vienna}, URL = {http://www.pri.univie.ac.at/~schiki/research/vipios/paper/brezany-choudhary.ps}, keyword = {parallel I/O, out of core, irregular applications, compiler, pario-bib} } @InProceedings{broom:acacia, author = {Bradley M. Broom}, title = {A Synchronous File Server for Distributed File Systems}, booktitle = {Proceedings of the 16th Australian Computer Science Conference}, year = {1993}, earlier = {broom:acacia-tr}, keyword = {distributed file system, pario-bib}, comment = {See broom:acacia-tr. See also broom:impl, lautenbach:pfs, mutisya:cache, and broom:cap.} } @TechReport{broom:acacia-tr, author = {Bradley M. Broom}, title = {A Synchronous File Server for Distributed File Systems}, year = {1992}, month = {August}, number = {TR--CS--92--12}, institution = {Dept. of Computer Science, Australian National University}, later = {broom:acacia}, keyword = {distributed file system, pario-bib}, comment = {This paper is not specifically about parallel I/O, but the file system will be used in the AP-1000 multiprocessor. Acacia is a file server that is optimized for synchronous writes, like those used in stateless protocols (eg, NFS). It writes inodes in blocks in any free location that is close to the current head position, using indirect inode blocks to track those. Indirect blocks are in turn written anywhere convenient, and their positions are tracked by the superblock. There is one slot in each cylinder reserved for the superblock, which is timestamped. They get good performance but claim to need a better implementation, and a faster allocation algorithm. No indication of effect on read performance.} } @InProceedings{broom:cap, author = {Bradley M. 
Broom and Robert Cohen}, title = {Acacia: A Distributed, Parallel File System for the {CAP-II}}, booktitle = {Proceedings of the First Fujitsu-ANU CAP Workshop}, year = {1990}, month = {November}, keyword = {distributed file system, multiprocessor file system, pario-bib}, comment = {See also broom:acacia, broom:impl, lautenbach:pfs, and mutisya:cache. This describes the semantic model for their file system. Modelled a lot after Amoeba, they have capabilities that represent immutable files. There are create, destroy, read, and write operations, but the read and write can affect only part of the file, if desired. They also have an atomic ``copy'' operation, which creates a snapshot of the current state of the file. They also have ``spawn'' and ``merge'' operations, which are essentially begin and end a transaction, a set of changes that are atomically merged into the file later. These seem to be addressing issues of concurrency more than of parallelism. They also discuss implementation somewhat, mentioning the use of distributed caches and log-structured disk layout. Prototype in Linda (!).} } @InProceedings{broom:impl, author = {Bradley M. Broom}, title = {Implementation and Performance of the {Acacia} File System}, booktitle = {Proceedings of the Second Fujitsu-ANU CAP Workshop}, year = {1991}, month = {November}, keyword = {distributed file system, multiprocessor file system, pario-bib}, comment = {See also broom:acacia, lautenbach:pfs, mutisya:cache, and broom:cap. This paper is a very sketchy overview of those; it is better to read them.} } @InProceedings{broom:perf, author = {Bradley M. Broom}, title = {Performance Measurement of the {Acacia} Parallel File System for the {AP1000} Multicomputer}, booktitle = {Proc. 
Second Parallel Computing Workshop}, year = {1993}, month = {November}, pages = {{P1-F-1} to {P1-F-11}}, publisher = {Fujitsu Parallel Computing Research Facilities, Fujitsu Laboratories Ltd.}, address = {Kawasaki, Japan}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, comment = {They evaluate the performance of Acacia with some simple synthetic benchmarks. Performance limited by implementation problems in the sequential file system. Otherwise no real surprises.} } @InProceedings{browne:io-arch, author = {J. C. Browne and A. G. Dale and C. Leung and R. Jenevein}, title = {A Parallel Multi-Stage {I/O} Architecture with Self-managing Disk Cache for Database Management Applications}, booktitle = {Proceedings of the Fourth International Workshop on Database Machines}, year = {1985}, month = {March}, publisher = {Springer-Verlag}, keyword = {parallel I/O, disk caching, database, pario-bib}, comment = {A fancy interconnection from procs to I/O processors, intended mostly for DB applications, that uses cache at I/O end and a switch with smarts. Cache is associative. Switch helps out in sort and join operations.} } @TechReport{bruce:chimp, author = {R. A. A. Bruce and S. R. Chapple and N. B. MacDonald and A. S. Trew}, title = {{CHIMP} and {PUL}: Support for Portable Parallel Programming}, year = {1993}, month = {March}, number = {EPCC-TR93-07}, institution = {Edinburgh Parallel Computing Center}, URL = {file://ftp.epcc.ed.ac.uk/pub/pul/chimp-pul-overview.ps}, keyword = {parallel programming, parallel I/O, pario-bib}, comment = {An overview of the CHIMP message-passing library and the PUL set of libraries. Key design goal is portability; they run on many systems. PUL includes PUL-GF, which supports parallel access to files (see chapple:pulgf, chapple:pulgf-adv, and chapple:pario). Other PUL libraries support grids and meshes, global communications, and task farms. 
Contact pul@epcc.ed.ac.uk.} } @InProceedings{brunet:factor, author = {Jean-Philippe Brunet and Palle Pedersen and S.~Lennart Johnsson}, title = {Load-Balanced {LU} and {QR} Factor and Solve Routines for Scalable Processors with Scalable {I/O}}, booktitle = {Proceedings of the 17th IMACS World Congress}, year = {1994}, month = {July}, address = {Atlanta, GA}, note = {Also available as Harvard University Computer Science Technical Report TR-20-94.}, URL = {ftp://das-ftp.harvard.edu/techreports/tr-20-94.ps.gz}, keyword = {parallel I/O, linear algebra, out-of-core, pario-bib}, abstract = {The concept of block-cyclic order elimination can be applied to out-of-core $LU$ and $QR$ matrix factorizations on distributed memory architectures equipped with a parallel I/O system. This elimination scheme provides load balanced computation in both the factor and solve phases and further optimizes the use of the network bandwidth to perform I/O operations. Stability of LU factorization is enforced by full column pivoting. Performance results are presented for the Connection Machine system CM-5.}, comment = {Short, not many details. Performance results show about 3.5 Gflops for all problem sizes, both in-core on small N and out-of-core on large N.} } @Article{cabrera:pario, author = {Luis-Felipe Cabrera and Darrell D. E. Long}, title = {Swift: {Using} Distributed Disk Striping to Provide High {I/O} Data Rates}, journal = {Computing Systems}, year = {1991}, month = {Fall}, volume = {4}, number = {4}, pages = {405--436}, earlier = {cabrera:pariotr}, keyword = {parallel I/O, disk striping, distributed file system, pario-bib}, comment = {See cabrera:swift, cabrera:swift2. Describes the performance of a Swift prototype and simulation results. They stripe data over multiple disk servers (here SPARC SLC with local disk), and access it from a SPARC2 client. Their prototype gets nearly linear speedup for reads and asynchronous writes; synchronous writes are slower.
They hit the limit of the Ethernet and/or the client processor with three disk servers. Adding another Ethernet allowed them to go higher. Simulation shows good scaling. Seems like a smarter implementation would help, as would special-purpose parity-computation hardware. Good arguments for use of PID instead of RAID, to avoid a centralized controller that is both a bottleneck and a single point of failure.} } @TechReport{cabrera:pariotr, author = {Luis-Felipe Cabrera and Darrell D. E. Long}, title = {Swift: {Using} Distributed Disk Striping to Provide High {I/O} Data Rates}, year = {1991}, number = {CRL-91-46}, institution = {UC Santa Cruz}, later = {cabrera:pario}, URL = {ftp://ftp.cse.ucsc.edu/pub/tr/ucsc-crl-91-46.ps.Z}, keyword = {parallel I/O, disk striping, distributed file system, pario-bib} } @TechReport{cabrera:stripe, author = {Luis-Felipe Cabrera and Darrell D. E. Long}, title = {Using Data Striping in a Local Area Network}, year = {1992}, month = {March}, number = {UCSC-CRL-92-09}, institution = {Univ. California at Santa Cruz}, URL = {ftp://ftp.cse.ucsc.edu/pub/tr/ucsc-crl-92-09.ps.Z}, keyword = {striping, parallel I/O, distributed system, pario-bib}, comment = {See cabrera:swift2, cabrera:swift, cabrera:pario. Not much new here. Simulates higher-performance architectures. Shows reasonable scalability. Counts 5 inst/byte for parity computation.} } @TechReport{cabrera:swift, author = {Luis-Felipe Cabrera and Darrell D. E. Long}, title = {Swift: A Storage Architecture for Large Objects}, year = {1990}, number = {UCSC-CRL-89-04}, institution = {U.C. Santa Cruz}, later = {cabrera:swift2}, URL = {ftp://ftp.cse.ucsc.edu/pub/tr/ucsc-crl-89-04.tar}, keyword = {parallel I/O, disk striping, distributed file system, multimedia, pario-bib}, comment = {See cabrera:swift2. A brief outline of a design for a high-performance storage system, designed for storing and retrieving large objects like color video or visualization data at very high speed.
They distribute data over several ``storage agents'', which are some form of disk or RAID. They are all connected by a high-speed network. A ``storage manager'' decides where to spread each file, what kind of reliability mechanism is used. User provides preallocation info such as size, reliability level, data rate requirements, and so forth.} } @InProceedings{cabrera:swift2, author = {Luis-Felipe Cabrera and Darrell D. E. Long}, title = {Exploiting Multiple {I/O} Streams to Provide High Data-Rates}, booktitle = {Proceedings of the 1991 Summer USENIX Technical Conference}, year = {1991}, pages = {31--48}, earlier = {cabrera:swift}, keyword = {parallel I/O, disk striping, distributed file system, multimedia, pario-bib}, comment = {See also cabrera:swift. More detail than the other paper. Experimental results from a prototype that stripes files across a distributed file system. Gets almost linear speedup in certain cases. Much better than NFS. Simulation to extend it to larger systems.} } @Article{cao:jtickertaip, author = {Pei Cao and Swee Boon Lim and Shivakumar Venkataraman and John Wilkes}, title = {The {TickerTAIP} parallel {RAID} architecture}, journal = {ACM Transactions on Computer Systems}, year = {1994}, month = {August}, volume = {12}, number = {3}, pages = {236--269}, publisher = {ACM Press}, earlier = {cao:tickertaip}, keyword = {parallel I/O, RAID, pario-bib}, comment = {See cao:tickertaip-tr2.} } @InProceedings{cao:tickertaip, author = {Pei Cao and Swee Boon Lim and Shivakumar Venkataraman and John Wilkes}, title = {The {TickerTAIP} parallel {RAID} architecture}, booktitle = {Proceedings of the 20th Annual International Symposium on Computer Architecture}, year = {1993}, pages = {52--63}, earlier = {cao:tickertaip-tr2}, later = {cao:jtickertaip}, keyword = {parallel I/O, RAID, pario-bib}, comment = {Superseded by cao:tickertaip-tr2 and cao:jtickertaip.} } @TechReport{cao:tickertaip-tr, author = {Pei Cao and Swee Boon Lim and Shivakumar Venkataraman and
John Wilkes}, title = {The {TickerTAIP} parallel {RAID} architecture}, year = {1992}, month = {December}, number = {HPL-92-151}, institution = {HP Labs}, later = {cao:tickertaip-tr2}, keyword = {parallel I/O, RAID, pario-bib}, comment = {A parallelized RAID architecture that distributes the RAID controller operations across several worker nodes. Multiple hosts can connect to different workers, allowing multiple paths into the array. The workers then communicate on their own fast interconnect to accomplish the requests, distributing parity computations across multiple workers. They get much better performance and reliability than plain RAID. They built a prototype and a performance simulator. Two-phase commit was needed for request atomicity, and a request sequencer was needed for serialization. Also found it was good to give the whole request info to all workers and to let them figure out what to do and when. Superseded by cao:tickertaip-tr2 and cao:tickertaip.} } @TechReport{cao:tickertaip-tr2, author = {Pei Cao and Swee Boon Lim and Shivakumar Venkataraman and John Wilkes}, title = {The {TickerTAIP} parallel {RAID} architecture}, year = {1993}, month = {April}, number = {HPL-93-25}, institution = {HP Labs}, earlier = {cao:tickertaip-tr}, later = {cao:tickertaip}, keyword = {parallel I/O, RAID, pario-bib}, comment = {Revised version of cao:tickertaip, actually: ``It's the ISCA paper with some text edits plus some new results on what happens if you turn disk request-scheduling on. It's been sent to TOCS.''. Thus it supersedes both cao:tickertaip-tr and cao:tickertaip. Eventually published as cao:jtickertaip.} } @InProceedings{carey:shore, author = {Michael J. Carey and David J. DeWitt and Michael J. Franklin and Nancy E. Hall and Mark L. McAuliffe and Jeffrey F. Naughton and Daniel T. Schuh and Marvin H. Solomon and C. K. Tan and Odysseas G. Tsatalos and Seth J. White and Michael J.
Zwilling}, title = {Shoring Up Persistent Applications}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data}, year = {1994}, pages = {383--394}, publisher = {ACM Press}, keyword = {persistent systems, database, parallel I/O, object-oriented, pario-bib}, comment = {SHORE is a persistent object database system. It is intended for parallel or distributed systems, and attempts to combine both DB and file system features. Everything in the database is a typed object, in that there is a registered interface object that defines this type, including the basic data types of elements of the object, and methods that manipulate the object. Every object has an OID, and objects can refer to other objects with the OID. But they also support unix-like namespace, in which the names refer to objects by giving the OID. They also have a unix-compatibility library that provides access to many objects through the unix file interface. Every node has a SHORE server, and applications talk to their local server for all their needs. The local server talks to other servers as needed. The servers are also responsible for caching pages and managing locks and transactions.} } @TechReport{carretero:case, author = {J. Carretero and F. {P\'erez} and P. de~Miguel and F. {Garc\'{\i}a} and L. Alonso}, title = {Implementation of a Parallel File System: {CCFS} a Case of Study}, year = {1994}, number = {FIM/84.1/DATSI/94}, institution = {Universidad Polit{\'e}cnica de Madrid}, address = {Madrid, Spain}, URL = {http://laurel.datsi.fi.upm.es/~gp/publications/datsi84.1.ps.Z}, keyword = {multiprocessor file system, parallel I/O, pario-bib}, abstract = {This document briefly describes the components of the Cache Coherent File System (CCFS) source code. CCFS has three main components: Client File Server (CLFS), Local File Server (LFS), Concurrent Disk System (CDS). The main modules and functions of each component are described here.
Special emphasis has been put on interfaces and data structures.}, comment = {See carretero:*, rosales:cds, perez:clfs.} } @TechReport{carretero:concepts, author = {J. Carretero and F. {P\'erez} and P. de~Miguel and F. {Garc\'{\i}a} and L. Alonso}, title = {Multicomputer Parallel File Systems Design Concepts: {CCFS} a case of study}, year = {1994}, number = {FIM/79.1/DATSI/94}, institution = {Universidad Polit{\'e}cnica de Madrid}, address = {Madrid, Spain}, URL = {http://laurel.datsi.fi.upm.es/~gp/publications/datsi79.1.ps.Z}, keyword = {multiprocessor file system, parallel I/O, pario-bib}, comment = {See carretero:*, rosales:cds, perez:clfs.} } @Article{carretero:evaluation, author = {J. Carretero and F. {P\'erez} and P. de~Miguel and F. {Garc\'{\i}a} and L. Alonso}, title = {A Multiprocessor Parallel Disk System Evaluation}, journal = {Decentralized and Distributed Systems}, year = {1993}, month = {September}, publisher = {North Holland}, note = {IFIP Transactions A-39}, URL = {http://laurel.datsi.fi.upm.es/~gp/publications/first_esprit.ps.Z}, keyword = {multiprocessor file system, parallel I/O, pario-bib}, abstract = {This paper presents a Parallel Disk System (PDS) for general purpose multiprocessors, which provides support for conventional file systems and databases, as well as direct access for applications requiring high performance mass storage. We present a systematic method to characterize a parallel I/O system, using it to evaluate PDS and to identify an optimal PDS configuration. Several devices (single disk, Raid3 and Raid5), and different configurations of I/O nodes, each one with a different type of device, have been simulated. Throughput and I/O rate of each configuration have been obtained for the former configurations and different types of workloads (database, general purpose and scientific applications).}, comment = {See carretero:*, rosales:cds, perez:clfs.} } @TechReport{carretero:lfs, author = {J. Carretero and F. {P\'erez} and P. de~Miguel and F.
{Garc\'{\i}a} and L. Alonso}, title = {{LFS} Design: A Parallel File Server for Multicomputers}, year = {1994}, number = {FIM/81.1/DATSI/94}, institution = {Universidad Polit{\'e}cnica de Madrid}, address = {Madrid, Spain}, URL = {http://laurel.datsi.fi.upm.es/~gp/publications/datsi81.1.ps.Z}, keyword = {multiprocessor file system, parallel I/O, pario-bib}, abstract = {This document describes the detailed design of the LFS, one of the components of the Cache Coherent File System (CCFS). CCFS has three main components: Client File Server (CLFS), Local File Server (LFS), Concurrent Disk System (CDS). The Local File Servers are located on each disk node, to develop file server functions in a per node basis. The LFS will interact with the Concurrent Disk System (CDS) to execute real input/output and to manage the disk system, partitions, distributed partitions, etc. The LFS includes general file system services and specialized services, and it will be responsible of maintaining cache consistency, distributing accesses to other servers, controlling partition information, etc.}, comment = {See carretero:*, rosales:cds, perez:clfs.} } @InProceedings{carretero:mapping, author = {J. Carretero and F. {P\'erez} and P. de~Miguel and F. {Garc\'{\i}a} and L. Alonso}, title = {{I/O} Data Mapping in {{\em ParFiSys:}} Support for High-Performance {I/O} in Parallel and Distributed Systems}, booktitle = {Euro-Par~'96}, year = {1996}, month = {August}, series = {Lecture Notes in Computer Science}, volume = {1123}, pages = {522--526}, publisher = {Springer-Verlag}, URL = {http://laurel.datsi.fi.upm.es/~gp/publications/europar96.ps.Z}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, abstract = {This paper gives an overview of the I/O data mapping mechanisms of {\em ParFiSys}. Grouped management and parallelization are presented as relevant features.
I/O data mapping mechanisms of {\em ParFiSys}, including all levels of the hierarchy, are described in this paper.} } @Article{carretero:parfisys, author = {J. Carretero and F. {P\'erez} and P. de~Miguel and F. {Garc\'{\i}a} and L. Alonso}, title = {{ParFiSys}: A Parallel File System for {MPP}}, journal = {ACM Operating Systems Review}, year = {1996}, month = {April}, volume = {30}, number = {2}, pages = {74--80}, keyword = {multiprocessor file system, parallel I/O, pario-bib}, comment = {See carretero:*, rosales:cds, perez:clfs.} } @TechReport{carretero:posix, author = {J. Carretero and F. {P\'erez} and P. de~Miguel and F. {Garc\'{\i}a} and L. Alonso}, title = {Prototype {POSIX}-Style Parallel File Server and Report for the {CS-2}}, year = {1993}, number = {D1.7/1}, institution = {Universidad Polit{\'e}cnica de Madrid}, address = {Madrid, Spain}, URL = {http://laurel.datsi.fi.upm.es/~gp/publications/first_esprit.ps.Z}, keyword = {multiprocessor file system, parallel I/O, pario-bib}, comment = {See carretero:*, rosales:cds, perez:clfs.} } @TechReport{carretero:posix-final, author = {J. Carretero and F. {P\'erez} and P. de~Miguel and F. {Garc\'{\i}a} and L. Alonso}, title = {{POSIX}-Style Parallel File Server for the {GPMIMD}: Final Report}, year = {1995}, number = {D1.7/2}, institution = {Universidad Polit{\'e}cnica de Madrid}, address = {Madrid, Spain}, URL = {http://laurel.datsi.fi.upm.es/~gp/publications/final_esprit.ps.Z}, keyword = {multiprocessor file system, parallel I/O, pario-bib}, comment = {See carretero:*, rosales:cds, perez:clfs.} } @Article{carretero:subsystem, author = {J. Carretero and F. {P\'erez} and P. de~Miguel and F. {Garc\'{\i}a} and L.
Alonso}, title = {A Massively Parallel and Distributed {I/O} Subsystem}, journal = {Computer Architecture News}, year = {1996}, month = {June}, volume = {24}, number = {3}, pages = {1--8}, keyword = {parallel I/O, I/O architecture, pario-bib}, comment = {See carretero:*, rosales:cds, perez:clfs.} } @TechReport{carter:benchmark, author = {Russell Carter and Bob Ciotti and Sam Fineberg and Bill Nitzberg}, title = {{NHT-1} {I/O} Benchmarks}, year = {1992}, month = {November}, number = {RND-92-016}, institution = {NAS Systems Division, NASA Ames}, URL = {http://www.nas.nasa.gov/NAS/TechReports/RNDreports/RND-92-016/RND-92-016.html}, keyword = {parallel I/O, benchmark, pario-bib}, comment = {Specs for three scalable-I/O benchmarks to be used for evaluating I/O for multiprocessors. One measures application I/O by mixing I/O and computation, one measures max disk I/O by reading and writing 80\% of the total RAM memory, and the last one is for sending that data from the file system, through the network, and back. See fineberg:nht1.} } @InProceedings{catania:array, author = {V. Catania and A. Puliafito and S. Riccobene and L. Vita}, title = {Performance Evaluation of a Partial Dynamic Declustering Disk Array System}, booktitle = {Proceedings of the Third IEEE International Symposium on High Performance Distributed Computing}, year = {1994}, month = {August}, pages = {244--252}, keyword = {parallel I/O, disk array, pario-bib}, abstract = {With a view to improving the performance and the fault tolerance of mass storage units, this paper concentrates on the architectural issues of parallelizing I/O access and a disk array system by means of definition of a new, particularly flexible architecture, called Partial Dynamic Declustering, which is fault-tolerant and offers higher levels of performance and reliability than the solutions normally used.
A fast distributed algorithm based on a dynamic structure and usable for the implementation of an efficient I/O subsystem manager is proposed. Particular attention is also paid to the definition of analytical models based on Stochastic Reward Petri nets in order to analyze the performance and reliability of the system proposed.} } @Article{catania:disk-array, author = {V. Catania and A. Puliafito and S. Riccobene and L. Vita}, title = {Design and Performance Analysis of a Disk Array System}, journal = {IEEE Transactions on Computers}, year = {1995}, month = {October}, volume = {44}, number = {10}, pages = {1236--1247}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, disk array, pario-bib}, abstract = {We concentrate on the architectural issues of parallelizing I/O access in a disk array system by means of definition of a new, particularly flexible architecture, called partial dynamic declustering, which is fault-tolerant and offers higher levels of performance and reliability than the solutions normally used. A simulation analysis highlights the efficiency of the proposed solution in balancing the file system workload and demonstrates its validity in both cases of unbalanced loads and expansion of the system. Particular attention is also paid to the definition of analytical models, based on stochastic reward nets, in order to analyze the performance and reliability of the system. The response time distribution function is evaluated and a specific performance analysis with varying degrees of declustering and workload is carried out.} } @InProceedings{chandy:array, author = {John A. Chandy and Prithviraj Banerjee}, title = {Reliability Evaluation of Disk Array Architectures}, booktitle = {Proceedings of the 1993 International Conference on Parallel Processing}, year = {1993}, pages = {I--263--267}, publisher = {CRC Press}, address = {St.
Charles, IL}, keyword = {parallel I/O, disk array, pario-bib, RAID}, comment = {A framework for evaluating the reliability of RAIDs. They consider failure and repair rates that depend on the workload.} } @TechReport{chao:datamesh, author = {Chia Chao and Robert English and David Jacobson and Bart Sears and Alexander Stepanov and John Wilkes}, title = {{DataMesh} architecture 1.0}, year = {1992}, month = {December}, number = {HPL-92-153}, institution = {HP Labs}, earlier = {wilkes:datamesh}, URL = {http://www.hpl.hp.com/personal/John_Wilkes/papers/HPL-92-153.ps.Z}, keyword = {parallel I/O, parallel file system, pario-bib}, comment = {A more detailed spec of the datamesh architecture, specifying components and operations. It is a block server where blocks are associatively addressed by tags. Some search operations are supported, as are atomic tag-changing operations. See also cao:tickertaip, wilkes:datamesh1, wilkes:datamesh, wilkes:houses, wilkes:lessons.} } @Manual{chapple:pario, author = {S. R. Chapple and R. A. Fletcher}, title = {{PUL-GF} Parallel {I/O} Concepts}, year = {1993}, month = {February}, organization = {Edinburgh Parallel Computing Center}, note = {EPCC-KTP-PUL-GF-PROT-CONC 1.0}, URL = {file://ftp.epcc.ed.ac.uk/pub/pul/concepts-i.ps}, keyword = {parallel I/O, pario-bib}, comment = {See also bruce:chimp, chapple:pulgf, and chapple:pulgf-adv, for general information on CHIMP and PUL-GF. This document is an exploration of the potential ways to parallelize the underlying I/O support for the PUL-GF interface. They reason about tradeoffs in the number of servers, disks, and clients, but (as they note) without any performance evaluation to back it up. In particular, they argue that there should be one partition per disk, one server per disk, and probably one client to many servers, or many clients to many servers. 
A key assumption is that a traditional serial file system is the home location for files, and that files are ``converted'' into parallel files (or vice versa) by replicating or distributing them. Application could choose the number of servers (and hence disks) for each file. Hints could be provided about many things. Interesting idea to allow user hooks for cache prefetch and writeback functions. Support for variable-length records (``atoms'') is a key component. Segments of a file with different formats, e.g., a header and a matrix, may be separated into different components when the file is distributed into parallel form. See chapple:pulpf for info on the eventual realization of these ideas.} } @Manual{chapple:pulgf, author = {S. R. Chapple and S. M. Trewin}, title = {{PUL-GF} Prototype User Guide}, year = {1993}, month = {February}, organization = {Edinburgh Parallel Computing Center}, note = {EPCC-KTP-PUL-GF-UG 0.1}, URL = {file://ftp.epcc.ed.ac.uk/pub/pul/gf-prot-ug.ps}, keyword = {parallel I/O, pario-bib}, comment = {PUL is a set of libraries that run on top of the CHIMP portable message-passing library (see bruce:chimp). One of the PUL libraries is PUL-GF, to support file I/O. The underlying I/O support is not parallel (but see chapple:pario). The interface is parallel, however; in particular, it supports file modes similar to those used in many systems, which they call single, multi, random, and independent. Formatted and unformatted, synchronous and asynchronous. Very general multidimensional-array read and write functions. Ability to group multiple I/O requests into atomic units, though not a full transaction capability. See also chapple:pulgf-adv and chapple:pario.} } @Manual{chapple:pulgf-adv, author = {S. R. 
Chapple}, title = {{PUL-GF} Prototype Advanced User Guide}, year = {1993}, month = {January}, organization = {Edinburgh Parallel Computing Center}, note = {EPCC-KTP-PUL-GF-PROT-ADV-UG 0.1}, URL = {file://ftp.epcc.ed.ac.uk/pub/pul/gf-prot-adv-ug.ps.Z}, keyword = {parallel I/O, pario-bib}, comment = {See chapple:pulgf for a definition of PUL-GF. This document describes the internal client-server interface to PUL-GF, including ways that users can extend the functionality of PUL-GF. In particular, they give an example of how a new file format (a run-length encoded 2-d matrix) can be read and written transparently as if it were a plain matrix file. The extensibility is offered by run-time registration of user-defined interposition functions, to be called at key moments in the processing of a file I/O request. See also bruce:chimp and chapple:pario.} } @Manual{chapple:pulpf, author = {S. R. Chapple}, title = {{PUL-PF} Reference Manual}, year = {1994}, month = {January}, organization = {Edinburgh Parallel Computing Center}, note = {EPCC-KTP-PUL-PF-PROT-RM 1.1}, URL = {file://ftp.epcc.ed.ac.uk/pub/pul/pf-prot-rm.ps}, keyword = {parallel I/O, multiprocessor file system interface, pario-bib}, comment = {See also chapple:pulgf and chapple:pario. An evolution of their parallel I/O interface. PUL-PF is a library on top of existing file systems. Every process is either a client or a server; servers write some portion of the file to a file in the file system. Servers can be divided into groups so that files need not be spread across all servers. There seems to be client caching, with consistency controlled differently depending on access mode; when necessary, the application must call get-token and send-token commands to serialize access to an atom. Independently of their single, multi, rando