%
% (Compiled by) 1993
%
%   Wolfgang Schreiner <Wolfgang.Schreiner at risc.uni-linz.ac.at>
%   Research Institute for Symbolic Computation (RISC-Linz)
%   Johannes Kepler University, A-4040 Linz, Austria
%
% PLEASE DO NOT REMOVE THIS NOTICE.
%

@inproceedings{Abra85,
author = {Abramski, Samson},
title = {{Strictness Analysis and Polymorphic Invariance (Extended Abstract)}},
booktitle = {Programs as Data Objects, Proceedings of a Workshop},
address = {Copenhagen, Denmark, October 17--19},
year = {1985},
volume = {217},
series = {Lecture Notes in Computer Science},
publisher = {Springer, Berlin},
editor = {Ganzinger, H. and Jones, N. D.},
pages = {1--23},
abstract = {In this paper an approach to strictness analysis for polymorphic
functions is developed.},
keywords = {Strictness Analysis, Abstract Interpretation},
}

@techreport{ACM88,
author = {Arvind and Culler, David E. and Maa, Gino K.},    
title = {{Assessing the Benefits of Fine-grained Parallelism in Dataflow
Programs}},
type = {Computation Structures Group Memo},
number = {279},
institution = {Laboratory for Computer Science},
address = {Massachusetts Institute of Technology, Cambridge, MA},
month = {March},
year = {1988},
abstract = {A method for assessing the benefits of fine-grain parallelism in
``real'' programs is presented. The method is based on {\em parallelism
profiles\/} and {\em speedup curves\/} derived by executing dataflow graphs on
an interpreter under progressively more realistic assumptions about processor
resources and communication costs. It is shown that programs, even using
traditional algorithms, exhibit ample parallelism when parallelism is exposed
at all levels, i.e., within expressions, across nested loops and function
calls, and in producer-consumer relationships on individual elements of data
structures. Since we only consider dataflow graphs compiled from the
high-level language Id, the bias introduced by the language and the compiler 
is examined. A method of estimating speedup trough analysis of the ideal
parallelism profile is developed, avoiding repeated execution of programs. It
is shown that fine-grain parallelism can be used to mask large, unpredictably
memory latency and synchronization waits in architectures employing dataflow
instruction execution mechanisms. Finally, the effects of grouping portions of
dataflow programs, such as function invocations or loop iterations, and
requiring that the operators in a group execute on a single processor, are
explored.}, 
keywords = {Dataflow, Id, Parallelism Profiles},
}

@inproceedings{AHN88,
author = {Arvind and Heller, Steve and Nikhil, Rishiyur S.},
title = {{Programming Generality and Parallel Computers}},
booktitle = {Fourth International Symposium on Biological and Artificial
Intelligence Systems},
address = {Trento, Italy, September 18--22},
year = {1988},
note = {Also: Computation Structures Group Memo 287, Laboratory for Computer
Science, Massachusetts Institute of Technology, Cambridge, MA},
abstract = {In recent years, many ``parallel, general-purpose'' computers have
become available, using various interconnections of processors and memories.
However, extensions to conventional languages to program these machines have
made programming significantly more complex --- a clear regression in software
technology. The user now has the additional problem of partitioning his
program into parallel parts, and often this is very machine-specific. Further,
it is no longer easy to write determinate programs, making debugging a
nightmare. In this paper we present an alternative approach. Id is a
declarative, implicitly parallel language that simultaneously raises the level
of programming and reveals much more parallelism than is possible with
programmer annotations. Determinacy is guaranteed by the language semantics.
We demonstrate this by developing a program in Id for a non-trivial problem
called the ``paraffins problem'', and examining the available parallelism,
after the fact.},
keywords = {Dataflow, Id, I-Structures},
}

@techreport{Alli87,
author = {Allison, Lloyd},
title = {{Two Functional Programming Techniques: Continuations, Circular
Programs}},
institution = {Department of Computer Science},
address = {Monash University, Clayton, Australia},
month = {January},
year = {1987},
number = {87/91},
abstract = {Continuations are used in denotational semantics to describe
control commands such as jumps. Here it is shown how they can be used as a
programming technique to simulate backtracking and coroutines. A {\em
circular program\/} creates a data structure where the computation of new
components depends on existing components. Examples arise where a list is to
be created in which later elements depend on earlier ones. Such programs can
avoid creating garbage in the form of intermediate lists, bringing some of the
efficiency of imperative programming to functional programming. The technique
can sometimes be used in an imperative language to create an elegant program
in a functional style.},
keywords = {Continuation, Backtracking, Coroutine, Non-Determinism},
}

@article{Alli89a,
author = {Allison, Lloyd},
title = {{Circular Programs and Self-referential Structures}},
journal = software,
volume = {19},
number = {2},
pages = {99-109},
year = {1989},
abstract = {A {\em circular program\/} creates a data structure whose
computation depends upon itself or refers to itself. The technique is used to
omplement the classic data structures circular and doubly-linked lists,
threaded trees and queues, in a functional programming language. These
structures are normally thought to require updateable variables found in
imperative languages. For example, a functional program to perform the
breadth-first traversal of a tree is given. Some of the examples result in
circular data structures when evaluated. Some examples are particularly
space-efficient by avoiding the creation of intermediate temporary structures
which would otherwise later become garbage. Lastly, the technique can be
applied in an imperative language to give an elegant program.},
keywords = {Circular Program},
}

@techreport{Alli89b,
author = {Allison, Lloyd},
title = {{Applications of Recursively Defined Data Structures}},
institution = {Department of Computer Science},
address = {Monash University, Clayton, Australia},
month = {February},
year = {1989},
number = {88/119},
abstract = {A {\em circular program\/} contains a data structure whose
definition is self-referential or recursive. Use of such definition allows
efficient functional programs to be written that would not otherwise be
possible. A novel primes program and applications to the Fibonacci numbers, to
Ackermann's function and to constraint-satisfaction problems are given.},
keywords = {Circular Programs, Streams},
}

@inproceedings{ANP87,
author = {Arvind and Nikhil, Rishiyur S. and Pingali, Keshav K.},
title = {{I-Structures: Data Structures for Parallel Computing}},       
booktitle = {Graph Reduction, Proceedings of a Workshop},
address = {Santa Fe, New Mexico, September 29 -- October 1},
year = {1986},
volume = {279},
series = {Lecture Notes in Computer Science},
editor = {Fasel, Joseph H. and Keller, Robert M.},
publisher = {Springer, Berlin},
pages = {336--369},
note = {Also: Computation Structures Group Memo 269, Laboratory for Computer
Science, Massachusetts Institute of Technology, Cambridge, MA},
abstract = {It is difficult simultaneously to achieve elegance, efficiency and
parallelism in functional programs that manipulate large data structures. We
demonstrate this through careful analysis of program examples using three
common functional data-structuring approaches --- lists using {\tt Cons} and
arrays using {\tt Update} (both fine-grained operators), and arrays using {\tt
make-array} (a ``bulk'' operator). We then present I-structures as an
alternative, defining precisely the parallel operational semantics of Id, a
language with I-structures. We show elegant, efficient and parallel solutions
for the program examples in Id. I-structures make the language non-functional,
but do not raise determinacy issues. Finally, we show that even in the context
of purely functional languages, I-structures are invaluable for implementing
functional data abstractions.},
keywords = {Dataflow, I-structures},
}

@inproceedings{ArAr89,
author = {Ariola, Zena and Arvind},
title  = {{P-TAC: A Parallel Intermediate Language}}, 
booktitle = {{FPCA '89, Functional
Programming Languages and Computer Architecture}},
address = {London, UK, September 11--13},
year = {1989},
pages = {230--242},
publisher = {ACM Press, New York},
abstract = {P-TAC is an intermediate-level language designed to capture the
sharing of computation. It is a more suitable internal language for functional
language compilers than the {\em $\lambda$-calculus or combinators\/},
especially for compiler optimizations. Using P-TAC, a proof for the confluence
of Id, a higher-order functional language augmented with I-structures, is
given. Using the notion of observational congruence the correctness of some
compiler optimizations is shown.},
keywords = {Compilation, Target Language},
}

@techreport{ArCu87,
author = {Arvind and Culler, David E.},    
title = {{Resource Requirements of Dataflow Programs}},                      
type = {Computation Structures Group Memo},
number = {280},
institution = {Laboratory for Computer Science},
address = {Massachusetts Institute of Technology, Cambridge, MA},
month = {December},
year = {1987},
abstract = {Parallel execution of programs requires more resources and more
complex resource management than sequential execution. If concurrent tasks can
be spawned dynamically, programs may require an inordinate amount of resources
when the potential parallelism in the program is much greater than the amount
of parallelism the machine can exploit. We describe {\em loop bounding\/}, a
technique for dynamically controlling the amount of parallelism exposed in
dataflow programs. The effectiveness of the technique in reducing token
storage requirements is supported by experimental data in the form of {\em
parallelism profiles\/} and {\em waiting-token profiles\/}. Comparisons are
made throughout with more conventional approaches to parallel computing.},
keywords = {Dataflow, Resource Allocation, Parallelism Profiles},
}

@techreport{ArEk87,
author = {Arvind and Ekanadham, Kattamuri},                 
title = {{Future Scientific Programming on Parallel Machines}},         
type = {Computation Structures Group Memo},
number = {272},
institution = {Laboratory for Computer Science},
address = {Massachusetts Institute of Technology, Cambridge, MA},
month = {March},
year = {1987},
abstract = {A language for large scientific applications should facilitate
encoding and debugging of programs at the highest level possible. At the same
time it should facilitate generation of efficient code for parallel machines.
Often these two requirements are conflicting, and trade-offs must be made.
Functional and other declarative languages offer relief on both counts. The
use of higher-order functions, especially in curried forms, can raise the
level of programming dramatically. In addition, such languages often have
straightforward operation semantics, thereby providing tremendous
opportunities for parallel execution. Programs written in declarative
languages thus eliminate the problem of ``detecting parallelism''. This paper
illustrates programming in one such language, Id Nouveau, and contrasts it
with programming in Fortran. Using an excerpt from an application known as
Simple, it is shown how a program can be composed in Id Nouveau from small
functions that directly relate to the mathematical and physical concepts of
the problem. The difficulty of expressing these concepts in Fortran is
discussed. Finally, it is shown that by performing simple transformations,
such as in-line substitution of functions, the resulting Id Nouveau code
becomes as efficient as an equivalent Fortran program written to run
efficiently on a parallel machine.},
keywords = {Dataflow, Id, Scientific Programming},
}

@inproceedings{ArNi87,
author = {Arvind and Nikhil, R. S. },
title = {{Executing a Program on the MIT Tagged-Token Dataflow Architecture}},
booktitle = {PARLE '87, Parallel Architectures and Languages Europe, Volume 2:
Parallel Languages}, 
address = {Eindhoven, The Netherlands, June 15--19},
year = {1987},
volume = {259},
editor = {de Bakker, J. W. and Nijman, A. J. and Treleaven, P. C.},
series = {Lecture Notes in Computer Science},
publisher = {Springer, Berlin},
note = {Also: Computation Structures Group Memo 271, Laboratory for Computer
Science, Massachusetts Institute of Technology, Cambridge, MA},
abstract = {The MIT Tagged-Token Dataflow project has an unconventional, but
integrated approach to general-purpose high-performance parallel computing.
Rather than extending conventional sequential languages, we use Id, a
high-level language with fine-grained parallelism and determinacy implicit in
its operational semantics. Id programs are compiled to dynamic dataflow
graphs, a parallel machine language. Dataflow graphs are directly executed on
the MIT Tagged-Token Dataflow Architecture (TTDA), a novel multiprocessor
architecture. Dataflow research has advanced significantly in the last few
years; in this paper we provide an overview of our current thinking, by
describing example Id programs, their compilation to dataflow graphs, and
their execution on the TTDA. Finally, we describe related work and the status
of our project.},
keywords = {Compilation, Dataflow, Id},
}

@inproceedings{AbSy85,
author = {Abramski, S. and Sykes, R.},
title = {{Secd-m: A Virtual Machine for Applicative Programming}},
editor = {Jouannaud, Jean-Pierre},
booktitle = {{FPCA '85, Functional Programming Languages and Computer
Architecture}},
address = {Nancy, France, September 16--19},
year = {1985},
pages = {81--98},
series = {Lecture Notes in Computer Science},
volume = {201},
publisher = {Springer, Berlin},
abstract = {We present a virtual machine to support {\em applicative
multiprogramming\/} --- the description of concurrent, asynchronous systems
such as operating systems in a functional style. The machine extend's Landin's
secd machine to support multiple concurrent expression evaluation,
non-determinism in the form of the fair merge, and a full range of input and
output devices. This allows system programs to be written in a functional
style. The secd-m machine has been implemented and a number of functional
concurrent programs demonstrated.},
keywords = {SECD, Abstract Machine, Operating System},
}

@article{AsWa77,
author = {Ashcroft, E. A. and Wadge, W. W.},
title = {{Lucid, a Nonprocedural Language with Iteration}},
journal = cacm,
volume = {20},
number = {7},
month = {July},
year = {1977},
pages = {519--526},
abstract = {Lucid is a formal system in which programs can be written and
proofs of programs carried out. The proofs are particularly easy to follow and
straight-forward to produce because the statements in a Lucid program are
simply axioms from which the proof proceeds by (almost) conventional logic
reasoning, with the help of a few axioms and rules of inference for the
special lucid functions. As a programming language, Lucid is unconventional
because, among other things, the order of statements is irrelevant and
assignment statements are equations. Nevertheless, Lucid programs need not
look much different than iterative programs in a conventional structured
programming language using assignment and conditional statements and loops.},
keywords = {Dataflow, Lucid, Formal Systems, Semantics, Proving},
}

@inproceedings{AuJo89,
author = {Augustsson, Lennart and Johnsson, Thomas},
title = {{Parallel Graph Reduction with the $\langle{v,G}\rangle$-machine}},
booktitle = {{FPCA '89, Functional
Programming Languages and Computer Architecture}},
address = {London, UK, September 11--13},
year = {1989},
pages = {202--213},
publisher = {ACM Press, New York},
abstract = {We have implemented a parallel graph reducer on a commercially
available shared memory multiprocesor (a Sequent Symmetry) that achieves real
speedup compared to a fast compiled implementation of the conventional
G-machine. Using 15 procesors, this speedup ranges between 5 and 11 depending
on the program. Underlying the implementation is an abstract machine called
the $\langle{v,G}\rangle$-machine. We describe the sequential and the parallel
$\langle{v,G}\rangle$-machine, and our implementation of them. We provide
performance and speedup figures and graphs.},
keywords = {Programmed Graph Reduction},
}

@article{BBKR89,
author = {Bevan, D. I. and Burn, G. L. and Karia, R. J. and Robson, J. D.},
title = {{Principles For the Design of a Distributed Memory Architecture for
Parallel Graph Reduction}},
year = {1989},
journal = compj,
volume = {32},
number = {5},
pages = {461--469},
abstract = {Many models for the parallel reduction of lazy functional
languages have been proposed in the literature. The one we have chosen to
implement is based on evaluation transformers. An evaluation transformer says
how much evaluation can be done to an argument expression in a function
application, given the amount of evaluation that can be done to the
application. Rather than just selecting a distributed memory architecture and
trying to support parallel graph reduction, we investigate the implication of
a minimally specified distributed memory architecture for parallel graph
reduction. The results of the investigation are incorporated into an abstract
machine which is able to support the communication and synchronisation needs
of the parallel reduction model on a distributed memory architecture. Certain
flags are needed on the nodes in the program graph in order to support the
model. These are motivated and described.},
keywords = {Parallel Graph Reduction, Evaluation Transformers},
}

@inproceedings{BDMT86,
author = {Boyle, James M. and Dritz, Kenneth, W. and Muralidharan, M. N. and
Taylor, Robert J.},
title = {{Deriving Sequential and Parallel Programs from Pure Lisp
Specifications by Program Transformation}},
booktitle = {Program Specification and Transformation},
address = {Bad T{\"o}lz, Germany, April 15--17},
year = {1986},
pages = {2--19},
publisher = {North Holland},
abstract = {We describe an ``industrial strength'' example of the use of
specification and transformation to produce efficient programs. The
specification is written in pure Lisp. Program transformations have been used
to generate from it a program that executes efficiently on a wide variety of
sequential machines. Another set of transformations has been used to generate
programs from it that execute efficiently on global-memory parallel machines.
We discuss some of the design descisions for the parallel version (and their
motivations), optimization of parallel programs, and the benefits of the
specification-transformation approach.},
keywords = {Program Transformation, Parallelization, Lisp},
}

@inproceedings{BeBe90,
author = {Berry, Gerard and Boudol, Gerard},
title = {{The Chemical Abstract Machine}},
booktitle = {{Seventeenth Annual ACM Symposium onn Principles of Programming 
Languages}},
address = {San Francisco, California, January 17--19},
year = {1990},
publisher = {ACM Press, New York},
pages = {81--94},
abstract = {We introduce a new kind of abstract machine based on the chemical
metaphor used in the $\Gamma$ language of Banatre \& al. States of a machine
are chemical solutions where floating molecules can interact according to
reaction rules. Solutions can be stratified by encapsulating subsolutions
within membranes that force reactions to occur locally. We illustrate the use
of this model by describing the operational semantics of the TCCS and CCS
process calculi. We also show how to extract a higher-order concurrent
$\lambda$-calculus out of the basic concepts of the chemical abstract
machine.}, 
keywords = {Abstract Machine},
}

@article{Bird84,
author = {Bird, R. S.},
title = {{Using Circular Programs to Eliminate Multiple Traversals of Data}},
journal = acta,
year = {1984},
volume = {21},
pages = {239-250},
abstract = {This paper describes a technique for transforming functional
programs that repeatedly traverse a data structure into more efficient
alternatives that do not. The transformation makes essential use of lazy
evaluation and local recursion (such as provided by letrec, or its equivalent)
to build a circular program that, on one pass over the structure, determines
the effects of the individual traversals and the combines them.},
keywords = {Circular Programs},
}

@article{BHY88,
author = {Bloss, Adrienne and Hudak, P. and Young, J.},
title = {{An Optimizing Compiler for a Modern Functional Language}},
journal = compj,
volume = {31},
number = {6},
year = {1988},
pages = {152--161},
abstract = {One of the factors hindering the use of functional languages has
been their relatively poor performance in comparison to more traditional
languages such as C and Pascal. During the last decade tremendous progress has
been made in building implementations of functional languages but the
approaches adopted have employed specialist hardware and/or compiler
optimizations that have been developed specifically for functional languages.
Building specialist hardware may be the best long-term solution but in the
short run it is possible to increase the use and acceptance of functional
languages by exploiting the performance of commercially available machines.
The goal of this project described in this paper has been to design an
optimizing compiler that produces fast code for functional languages on
conventional sequential and parallel machines.},
keywords = {Compilation, Strictness Analysis, Termination Analysis,
Destructive Aggregate Updating},
}

@techreport{BMN*88,
author = {Brandenburg, Joe and Mirashrafi, Mojy and McNamara, Jean and
Billstrom, David and Gilbert, Eric and Ferguson, Mark and
Sizer, Andrew and Alleyne, Paul},
title = {{Advances in Lisp for the iPSC/2}},
institution = {Intel Scientific Computers},
year = {1988},
type = {Technical Report},
address = {Beaverton, Oregon},
abstract = {A complete redesign of Common Lisp for the Intel iPSC/2 sytem
resulted in a new programming model based on multiple threads of execution,
substantial performance gains exceeding all previously published timings for
gabriel benchmarks, and a flexible user interface to manage the complexity of
concurrently executing Lisp environments. Although multiple thread execution
is not an unknown model of processing, the integration of threads with
existing iPSC Lisp constructs provided new insight into certain concurrent
applications. The performance of iPSC/2 Lisp is shown to be significantly
faster than PSL on a Cray-XMP supercomputer, for two of the largest Gabriel
benchmarks, iPSC/2 Lisp is shown to be the highest performance Lisp system
available, commercially or in academia. The user interface allows direction
and redirection of keyboard input, program output, and the handling of errors
to multiple screens --- important for large concurrent programs. Specific
information is provided on iPSC/2 Lisp performance on the Gabriel Benchmarks;
on the differences between iPSC/2 Lisp and CCLISP for the iPSC/2; and on the
flexibility of the new user interface. Examples are given for each of the new
constructs, user interface, and the performance.},
keywords = {Lisp, Explicit Parallelism},
}

@inproceedings{BNA91,
author = {Barth, Paul S. and Nikhil, Rishiyur S. and Arvind},
title = {{M-Structures: Extending a Parallel, Non-strict, Functional Language
with State}},
booktitle = {{Functional Programming Languages and Computer Architectures}},
address = {Harvard, Massachusetts},
year = {1991},
editor = {Hughes, John},
series = {Lecture Notes in Computer Science},
volume = {523},
publisher = {Springer, Berlin},
pages = {538--568},
abstract = {Functional programs are widely appreciated for being
``declarative'' (algorithms are not {\em over-specified\/}) and implicitly
parallel. However, when modeling state, both properties are compromised
because the state variables must be ``threaded'' through most functions.
Further, state ``updates'' may require much copying. A solution is to
introduce assignments, as in ML and Scheme; however, for meaningful semantics,
they resort to strict, sequential evaluation. We present an alternative
approach called {\em M-structures\/}, which are imperative data structures
with implicit synchronization. M-structures are added to Id, a parallel
non-strict functional language. We argue that some programs with M-structures
improve on their functional counterpart in three ways; they are (1) more {\em
declarative\/}; (2) more {\em parallel\/}; and (3) more {\em storage
efficient\/}. Points (1) and (2) contradict conventional wisdom. We study two
problems: histogramming a tree, and graph traversal, comparing functional and
M-structure solutions.},
keywords = {State, Updatable arrays, Non-determinism},
}

@inproceedings{Burn87,
author = {Burn, Geoffrey L.},
title = {{Evaluation Transformers --- A Model for the Parallel Evaluation of
Functional Languages (Extended Abstract)}},
booktitle = {{Functional Programming Languages and Computer Architecture}},
editor = {Kahn, Gilles},
address = {Portland, Oregon, USA, September 14--16},
year = {1987},
pages = {446--470},
series = {Lecture Notes in Computer Science},
volume = {274},
publisher = {Springer, Berlin},
abstract = {If we are not careful, a parallel machine may become swamped with
the computation of the expressions whose values are not needed in order to
produce the result from a program. We give a semantic criterion which ensures
that this does not happen. An abstract interpretation can be developed which
gives the definedness of a function in terms of the definedness of its
arguments. Traditionally this has been used to give a {\em strictness
analysis\/} which has been interpreted to say how much the argument in an
application can be evaluated in parallel with the application while still
satisfying the semantic criterion. Strictness analysis however only takes into
account local information. By taking into account contextual information of an
application, we are able to find that in many cases more evaluation of the
argument is allowed. This information is available from the same abstract
interpretation that is used for strictness analysis. Given how much evaluation
is allowed of a function application, an {\em evaluation transformer\/} says
how much evaluation of the argument in the application is allowed. This leads
to a natural model of the parallel evaluation of functional languages.},
keywords = {Strictness Analysis, Abstract Interpretation, Evaluation
Transformers}, 
}

@techreport{Burn88a,
author = {Burn, G. L.},
title = {{A Shared Memory Parallel G-machine Based on the Evaluation
Transformer Model of Computation}},
type = {Draft},
month = {November},
year = {1988},
institution = {GEC Hirst Research Centre},
address = {Wembley, UK},
abstract = {The evaluation transformer model of parallel reduction is a
natural extension of lazy evaluation. It is therefore sensible to use the
G-machine as a basis for defining a parallel machine supporting the model.
Information from the model is compiled into the code for the parallel
G-machine, and uses a number of special instructions defined for creation and
sysnchronization of subtasks. The instructions from the sequential machine
remain essentially unchanged. Besides the evaluation transformer model, we
incorporate a number of important concepts into our definition that are not
present in other definitions of parallel G-machines. A full definition of the
compilation schemes and transition rules for the instructions are given in an
appendix.}, 
keywords = {Parallel Graph Reduction, Evaluation Transformers, G-Machine},
}

@inproceedings{BuGu85,
author = {Bush, V. J. and Gurd, J. R.},
title = {{Transforming Recursive Programs for Execution on Parallel Machines}},
editor = {Jouannaud, Jean-Pierre},
booktitle = {{FPCA '85, Functional Programming Languages and Computer
Architecture}},
address = {Nancy, France, September 16--19},
year = {1985},
pages = {350--367},
series = {Lecture Notes in Computer Science},
volume = {201},
publisher = {Springer, Berlin},
abstract = {A linear recursive scheme makes
just one recursive call; when it is executed, the resulting computation is
sequential. Under certain conditions, there is an equivalent double recursive
(bilinear) scheme which contains two independent recursive calls. The
computation corresponding to this is parallel; the recursion causes a
tree-like expansion of function calls. This recursive form is useful for
exploiting the large amounts of parallelism available in dataflow and
reduction machine architectures. On the other hand, the number of processors
in such a parallel machine may be limited, in which case it might be desirable
to convert a bilinear scheme to linear form by reversing the direction of the
above transformation. In fact, the optimum strategy may be to transform
dynamically between the two recursive forms, depending on the run-time
availability of parallel processing resources.
This paper describes the conditions under which linear and
bilinear recursive forms are equivalent and demonstrates the importance and
utility of establishing such a correspondence.},
keywords = {FP, Recursion, Parallelization},
}

@inproceedings{BlHu86,
author = {Bloss, Adrienne and Hudak, Paul},
title = {{Variations on Strictness Analysis}},
booktitle = {{1986 ACM Symposium on Lisp and Functional Programming}},
address = {Cambridge, Massachusetts, August 4--6},
year = {1986},
pages = {132--142},
publisher = {ACM Press, New York},
abstract = {This paper discusses the analysis of first-order functional
programs in order to answer the following four questions: what definitely will
be evaluated after some expression $e$ is evaluated, what definitely won't be
evaluated after $e$ is evaluated, what definitely was evaluated before $e$ was
evaluated, what definitely wasn't evaluated before $e$ was evaluated.},
keywords = {Strictness Analysis},
}

@article{BHA86,
author = {Burn, Geoffrey L. and Hankin, Chris and Abramsky, Samson},
title = {{Strictness Analysis for Higher-Order Functions}},
journal = {Science of Computer Programming},
volume = {7},
year = {1986},
pages = {249--278},
note = {Also: Graph Reduction, Proceedings of a Workshop, volume 217 of
Lecture Notes of Computer Science, pages 42--63, Springer, Berlin, 1985.},
abstract = {Abstract Interpretation is a compile-time technique which is used
to gain information about a program that may then be used to optimize the
execution of the program. A particular use of abstract interpretation is the
strictness analysis of functional programs. This provides the key to the
exploitation of parallelism in the evaluation of programs written in
functional languages. In a language that has lazy semantics, the main
potential for parallelism arises in the evaluation of operands of strict
operators. A function is strict in an argument if its value is undefined
whenever the argument is undefined. If we can use strictness analysis to
detect which arguments a function is strict in, we then know that these
arguments can be safely evaluated in parallel because this will not affect the
lazy semantics. Experimental results suggest that this leads to significant
speedups. Mycroft was the first person to apply abstract interpretation to the
strictness analysis of functional programs. His framework only applies to
first-order functions on flat domains. Many workers have proposed practical
approaches to strictness analysis of higher-order functions over flat base
domains but their work has not been accompanied by extensions to Mycroft's
theoretical framework. In this paper, we give sound mathematical foundations
for this work and discuss some of the practical issues involved. The practical
approach is proved correct in relation to the theoretical framework.},
keywords = {Strictness Analysis, Abstract Interpretation},
}

@techreport{Burn90x,
author = {Burn, Geoffrey},
title = {{A New Domain For Head-Strictness}},
institution = {Department of Computing},
address = {Imperial College, London, UK},
month = {January},
year = {1990},
abstract = {A new domain is introduced that captures the notion of
head-strictness in functional programming languages.},
keywords = {Strictness Analysis, Abstract Interpretation},
}

@inproceedings{Burn90a,
author = {Burn, Geoffrey L.},
title = {{A Relationship Between Abstract Interpretation and Projection
Analysis (Extended Abstract)}},
booktitle = {POPL Symposium on Principles of Programming Languages, San
Francisco, CA, January 17--19},
year = {1990},
abstract = {Abstract interpretation and projection analysis are two techniques
for finding out information about lazy functional programs. Two typical uses
of these techniques are speeding up sequential implementations, and the
introduction of parallelism into parallel implementations. Our main result is
the proof of a relationship between a certain class of projections and a
certain class of abstract interpretations. One of the claims of projection
analysis is that it can find out information about head-strictness, whilst
abstract interpretation cannot. We show that there are at least two intuitive
notions of head-strictness, and that one of them can be determined using
abstract interpretation.},
keywords = {Strictness Analysis, Abstract Interpretation, Projection
Analysis},
}

@inproceedings{Burn90b,
author = {Burn, Geoffrey L.},
title = {{The Evaluation Transformer Model of Reduction and Its Correctness}},
booktitle = {TAPSOFT 91, Brighton, UK, April 9-12},
note = {Also: Technical Report DOC 90/19, Department of Computing, 
Imperial College, London},
year = {1991},
abstract = {Lazy evaluation of functional programs incurs time and memory
overheads, and restricts parallelism, when compared with programs that are
evaluated strictly. A number of analysis techniques, such as abstract
interpretation and projection analysis, have been developed to find out
information that can be used to alleviate these overheads. This paper
formalises an evaluation model, the evaluation transformer model of reduction,
which can use information form these analysis techniques, and proves that the
resulting reduction strategies produce the same answers as those obtained
using lazy evaluation.},
keywords = {Strictness Analysis, Evaluation Transformers},
}

@inproceedings{Burn90c,
author = {Burn, Geoffrey L.},
title = {{Using Projection Analysis in Compiling Lazy Functional Programs}},
booktitle = {ACM Conference on Lisp and Functional Programming},
address = {Nice France, June 27--29},
year = {1990}, 
pages = {227--241},
publisher = {ACM Press, New York},
abstract = {Projection Analysis is a technique for finding out information
about lazy functional programs. We show how the information obtained from this
analysis can be used to speed up sequential implementations, and introduce
parallelism into parallel implementations. The underlying evaluation model is
evaluation transformers, where the amount of evaluation that is allowed of an
argument in a function application dependsss on the amount of evaluation
allowed to the application. We prove that the transformed programs preserve
the semantics of the original programs. Compilation rules, which encode the
information from the analysis, are given for sequential and parallel
machines.}, 
keywords = {Strictness Analysis, Projection Analysis, Evaluation Transformers},
}

@article{Burn91a,
author = {Burn, Geoffrey L.},
title = {{Implementing the Evaluation Transformer Model of Reduction on
Parallel Machines}},
journal = {Journal of Functional Programming},
volume = {1},
number = {2},
month = {April},
year = {1991},
note = {Also Technical Report DOC 90/24, Department of Computing, Imperial
College, London, UK},
abstract = {The evaluation transformer model of reduction generalises lazy
evaluation in two ways: it can start the evaluation of expressions before
their first use, and it can evaluate expressions further than weak head normal
form. Moreover, the amount of evaluation required of an argument to a function
may depend on the amount of evaluation required of the function application.
It is a suitable candidate model for implementing lazy functional languages on
parallel machines. In this paper we explore the implementation of lazy
functional languages on parallel machines, both shared and distributed memory
architectures, using the evaluation transformer model of reduction. We will
see that the same code can be produced for both styles of architecture, and
the definition of the instruction set is virtually the same for each style.
The essential difference is that a distributed memory architecture has one
extra node type for non-local pointers and instructions which involve the
value of such nodes need their definitions extended to cover this new type of
node. To make our presentation accessible, we base our description on a
variant of the well-known G-machine, an abtract machine for executing lazy
functional programs.},
keywords = {Strictness Analysis, Evaluation Transformers},
}

@Article{Burt84,
author = {Burton, F. Warren},
title = {{Annotations to Control Parallelism and Reduction Order in
the Distributed Evaluation of Functional Programs}},
journal = toplas,
year = {1984},
volume = {6},
number = {2},
pages =	{159--174},
month =	{April},
abstract = {When evaluating a functional program on a network of processors,
it is necessary to decide when parallelism is desirable, which work may be
transferred to another processor, and in what form the work is to be
transferred. If the wrong decisions are made, a parallel evaluation may
require asymptotically more time than a sequential evaluation, owing to
communication costs. The introduction of annotations to give the programmer
control over the above decisions is proposed. The annotations and their
effects are definied in terms of the lambda calculus. Each application must
have one of three annotations. No other annotations are required. Examples and
possible extensions to this work, including annotated combinators, are briefly
considered.},
keywords = {Para-Functional Programming, Annotations},
}

@article{Burt88,
author = {Burton, F. W.},
title = {{Nondeterminism with Referential Transparency in Functional
Programming Languages}},
journal = compj,
volume = {31},
number = {3},
year = {1988},
pages = {243--247},
abstract = {Procedural programmers have found nondeterminism to be unavoidable
in many parallel programs (e.g.\ operating systems). If functional programming
languages are to be usable as general-purpose parallel-programming languages
then provision of some nondeterministic language constructs appears to be
necessary. Existing constructs for nondeterminism are not functional and are
not compatible with the mathematical foundations of functional programming,
which require that the value of a function be uniquely determined by the
values of its arguments. We propose to solve this problem by placing the
nondeterminism in pseudo-data. A program is passed an infinite tree of
two-valued decisions, along with its input. These decisions may be fixed at
run time, thereby permitting nondeterminism. Once fixed, a decision remains
unchanged, so equivalent expressions must always have the same value. The
approach generalizes so that a program can make use of other run-time
information such as the current time or the current amounts of available
storage.}, 
keywords = {Non-Determinism, Referential Transparency},
}

@Article{Burt87,
author = {Burton, F. W.},
title = {{Functional Programming for Concurrent and Distributed Computing}},
journal = compj,
year = {1987},
volume = {30},
number = {5},
pages =	{437--450},
abstract = {There are at least two approaches to the desing of languages for
parallel computing. One approach is to use functional or relational languages
which are easy to read, write, transform and verify. The more conventional
approach is to use procedural langauges which give a programmer a high degree
of control over the run-time behavior of a program. There is a need to
reconcile these two approaches in a language which permits both simplicity and
efficiency. We propose a small and simple set of annotations (or pragmas) to
control the run-time behavior of a functional progrma. The annotations allow
a programmer to use three forms of parameter passing. The parameter-passing
mechanisms correspond to passing by name, value and need in a sequential
language. In addition, in a distributed system a programmer can specify that
work should be done on the current processor, an arbitrary processor, or a
particular processor such as the one containing a specific data item. The
annotatins cannot affect the meaning (result) of a functional program, except
for causing non-termination in some cases (which we view as an extreme form of
inefficiency). This separation of meaning from control allows a program to be
both simple and efficient. Since non-determinism appears to be unavoidable
without significant loss of efficiency in a concurrent system, the interaction
of the proposed annotations with non-determinism is briefly considered. The
run-time behavior of an annotated functional program is similar to that of a
procedural programs using message passing, semaphores ore rendezvous to
control communication and synchronization.},
keywords = {Para-functional programming, Annotations, Process Mapping,
Non-determinism}, 
}

@inproceedings{Chen86,
author = {Chen, Marina C.},
title = {{Crystal: A Synthesis Approach to Programming Parallel Machines}},
booktitle = {First Conference on Hypercube Multiprocessors, Knoxville,
Tennessee, August 26--27},
year = {1985},
editor = {Health, Michael T.},
pages = {87--107},
abstract = {This paper addresses two software issues in parallel processing:
Fiirst, how to program parallel machines effectively, and second, how to
realize the high performance promised by the parallel hardware technology. An
overview of the language Crystal and a programming methodology for a class of
computational problems are presented. Crystal provides high-level abstractions
for the behavior of an ensamble of parallel processors, and is devised for the
purpose of managing effectively the complexity of programming large parallel
systems. To address the issue of effective utilization of hardware
technologies, a design methodology is developed for syntehsizing efficient
systolic algorithms and architectures from problem definitions.},
keywords = {Dataflow, Crystal},
}

@techreport{ChGo91,
author = {Chuang, Tyng-Ruey and Goldberg, Benjamin},
title = {{Backward Analysis for Higher-Order Functions Using Inverse Images}},
institution = {Department of Computer Science},
address = {New York University, NY},
month = {February},
year = {1991},
abstract = {We propose a method for performing backward analysis on
higher-order functional programming languages based on computing inverse
images of functions over abstract domains. The method can be viewed as
abstract interpretation done backward. Given an abstract semantics which
supports forward analysis, we can transform it into an abstract semantics
which performs backward analysis. We show that if the original semantics is
correct and computable, then the transformed version of the abstract semantics
is also correct and computable. More specifically, given a forward abstract
semantics of a higher-order functional language which is expressed in terms of
Scott-closed powerdomains, we derive a backward abstraction semantics which is
expressed in terms of Scott-open powerdomains. The derivation is shown to be
correct and the relationships between forward analysis and backward analysiss
is established. We apply this method to the classic strictness analysis in
functional languages and obtain promising results. We show that the time
complexity of inverse image based backward analysis is not much worse than the
forward analysis from which it is derived. We then compare this work with
previous works of Wadler, Hughes, and Burn, showing that some special
restrictions and constructions in previous works have natural interpretation
in the Scott-closed/Scott-open powerdomain framework. A brief outline of
applying the inverse image method to other backward semantics analysis is also
given.},
keywords = {Strictness Analysis, Abstract Interpretation, Backwards Analysis},
}

@techreport{Clar90,
author = {Clarke, Thomas},
title = {{The Semantics and Implementation of Aggregates or How to Express
Concurrency without Destroying Determinism}},
institution = {Computer Laboratory},
address = {University of Cambridge},
month = {December},
year = {1990},
abstract = {Associative commutative operations lead to algorithms with
concurrency which cannot be adequately represented by the pure lambda
calculus. This paper investigates two classes of associative commutative
operation which express extra concurrency, and shows that they preserve a
simple denotational semantics. {\em Simple aggregates\/} encapsulate the
evaluation order indeterminancy implicit in strict associative commutative
operations and have no effect on semantics. {\em Early Readable Aggregates\/}
encapsulate program values which are defined by a lower bound their
denotations. The denotational join on ERA values is an associative commutative
operation which can be used to express concurrency. Specific aggregates can be
used to implement concurrent graph marking, time deterministic merge of lazy
lists, and write-once locations.},
keywords = {Non-determinism, Aggregate},
}

@inproceedings{ClPe85,
author = {Clack, Chris and {Peyton Jones}, Simon L.},
title = {{Strictness Analysis --- A Practical Approach}},
booktitle = {{Functional Programming Languages and Computer Architecture}},
editor = {Jouannaud, Jean-Pierre},
address= {Nancy, France, September 16--19},
year   = {1985},
volume = {201},
series = {Lecture Notes in Computer Science},
publisher =  {Springer, Berlin},
pages  = {35--49},
abstract = {Significant improvements in performance arise if we can arrange
for parallel execution of programs. The absence of side effects in functional
languages allows concurrent evaluation of the program, but in lazy
implementations this risks wasting work by evaluating expressions which are
subsequently discarded. We discuss the use of strictness analysis to determine
at complie time which parts of program evaluation can safely be carried out
concurrently. We give a practical explanation of this technique, concentrating
particularly on the problem of finding fix points.},
keywords = {Strictness Analysis, Abstract Interpretation, Fixed Points},
}

@inproceedings{Cole88,
author = {Cole, M. I.},
title = {{A `Skeletal' Approach to the Exploitation of Parallel Computation}},
booktitle = {CONPAR 88},
address = {Manchester, UK, September 12--16},
year = {1988},
pages = {100--107},
volume = {C},
publisher = {British Computer Society},
abstract = {This paper introduces a new approach to the design of programming
ystems which can exploit parallelism while maintaining a reasonable level of
abstraction from the underlying hardware. This is based on the concept of
``algorithmic skeletons''. The abstract specification of four such skeletons
are presented and a possible implementation on a grid of processing units is
outlined.}, 
keywords = {Skeletons, Para-Functional Programming},
}

@techreport{CoMo90,
author = {Cooper, Eric C. and Morrisett, J. Gregory},
title = {{Adding Threads to Standard ML}},
number = {CMU-CS-90-186},
institution = {School of Computer Science},
address = {Carnegie Mellon University, Pittsburgh, PA},
month = {December},
year = {1990},
abstract = {We have added multiple threads of control to the Standard ML
programming language. Stanford ML's support for first-class functions and
automatic storage management influenced the design in a number of ways. We
demonstrate how other concurrency and synchronization operations, such as
cobegin/coend, futures, and events, can be implemented in terms of the thread
interface. Finally, we describe three implementations of the thread interface:
a coroutine version, a uniprocessor preemptive version, and a multiprocessor
Mach-based version.},
keywords = {ML, Threads},
}

@techreport{CRZ83,
author = {Cohen, Shismon and Rosner, Roni and Zidon, Ari},
title = {{PARALISP Simulator (Reference Manual)}},
type = {Research Report},
number = {83-2},
institution = {Computer Science Department},
address = {Hebrew University, Jerusalem, Israel},
month = {January},
year = {1983},
abstract = {PLISP (``Parallel Lisp'') is the current implementation of
``Senile Parallel Execution of Lisp Programs'' (Lehmann and Cohen). PLISP may
be thought of as a virtual machine designed to evaluate LISP expressions in a
parallel fashion. The language of this virtual machine is PARALISP. This
means, for example, that when a function application contains several formal
arguments, these arguments may be evaluated in parallel, under the assumption
that there are no dependencies, or side effects. Therefore, PARALISP is a
version of LISP designed to enable parallel evaluation to occur without side
effects.}, 
keywords = {Lisp},
}

@inproceedings{DaRe81,
author = {Darlington, John and Reeve, Mike},
title = {{ALICE --- A Multi-Processor Reduction Machine for the Parallel
Evaluation of Applicative Languages}},
booktitle = {{FPCA '81, Functional Programming Languages and Computer
Architecture}}, 
address = {Boston, Massachusetts},
year = {1981},
pages = {65--74},
publisher = {ACM Press, New York},
abstract = {In this paper we present a scheme for the parallel evaluation of a
wide variety of applicative languages and provide an overview of the
architecture of a machine on which it may be implemented.},
keywords = {Parallel Graph Reduction},
}

@inproceedings{DKL*88,
author = {Darlington, John and Kohshnevisan, Hessam and McLoughlin, Lee and
Perry, Nigel and Pull, Hellen and Sephton, Keith and While, Lyndon},
title = {{An Introduction to the Flagship Programming Environment}},
booktitle = {CONPAR 88},
address = {Manchester, UK, September 12--16},
year = {1988},
pages = {75--86},
volume = {A},
publisher = {British Computer Society},
abstract = {The Flagship programming environment provides support for the
development, implementation and modification of programs. Its novelty is that
it is designed to support programming in a pure functional language and to
give mechanical assistance to the process of deriving programs from high level
specifications via correctness preserving transformations. The target for
these programs is both the Flagship Parallel Machine and conventional parallel
machines.},
keywords = {Program Transformation},
}

@inproceedings{DRW89,
author = {Darlington, J. and Reeve, M. and Wright, S.},
title = {{Programming Parallel Computer Systems using Functional Languages and
Program Transformation}},
booktitle = {Parallel Processing '89},
address = {Leiden, The Netherlands},
year = {1989},
abstract = { One of the major problems associated with parallel computer
systems is the economical production of software that fully exploits the
hardware. Functional languages facilitate a formal methodology for the
development of programs for parallel machines.  This paper provides an
overview of one program development methodology developed at imperial College,
and presents a case study that illustrates its operation and quantifies its
benefits using results of experiments carried out on the ALICE parallel
machine.},
keywords = {Parrallel Graph Reduction},
}

@manual{Dimi88,
author = {Dimitrovsky, Isaac},
title = {{ZLISP 0.85 Reference Manual}},
organization = {Ultracomputer Research Laboratory},
address = {NewYork},
year = {1988},
month = {March},
abstract = {ZLISP is a portable parallel LISP system for shared memory MIMD
supercomputers. It contains several extensions to LISP to support the writing
of parallel programs. In the current version, all parallelism is explicit
although this may change in the future.},
keywords = {Lisp, Futures},
}

@techreport{EkAr87,
author = {Ekanadham, Kattamuri and Arvind},                 
title = {{SIMPLE: Part 1 --- An Exercise in Future Scientific Programming}},
type = {Computation Structures Group Memo},
number = {273},
institution = {Laboratory for Computer Science},
address = {Massachusetts Institute of Technology, Cambridge, MA},
month = {July},
year = {1987},
abstract = {Ideally, a high-level language should provide a way of writing
abstractions which are as close to the problem domain as possible, as well as
facilitate efficient implementations of these abstractions lest a user try to
``get underneath'' the abstractions. With the advent of parallel machines, a
language such as FORTRAN fails on both counts. Functional and other
declarative languages allegedly offer relief on both counts. The use of higher
order functions, including the free use of curried forms, can dramatically
raise the level of programming. In addition, such languages often have
straight-forward operational semantics which admits tremendous opportunities
for parallel execution. Programs in declarative languages, thus eliminate the
problem of ``detecting parallelism''. In this report we have examined the
first part of this claim by writing an application known as the SIMPLE code in
a language called Id Nouveau. The issues of parallelism will be examined in a
companion report, Part II, to be published later. We have presented a
high-level description of the algorithms used in the hydrodynamics problem,
called SIMPLE. Corresponding Id Noveau program fragments are derived from the
mathematical descriptions. The emphasis is on the style of programming, and
not on algorithmic cleverness. The resulting program has 550 lines and runs
successfully in ID World, which is a compiler for Id Noveau together with an
abstract simulation facility for a dataflow machine. This report presumes the
knowledge of Id Noveau, functional programming and I-structures.},
keywords = {Scientific Programming, Dataflow, Id, I-Structures},
}

@techreport{FeMi89,
author = {Feeley, Marc and Miller, James S.},
title = {{A Parallel Virtual Machine for Efficient Scheme Compilation}},
institution = {Brandeis University},
address = {Waltham, Massachusetts},
year = {1989},
abstract = {Programs compiled by Gambit, our Scheme compiler, achieve
performance as much as twice that of the fastest available Scheme compilers.
Gambit is easily ported, while retaining its high performance, through the use
of a simple virtual machine (PVM). PVM allows a wide variety of
machine-independent optimizations and it supports parallel computation based
on the {\tt future} construct. PVM conveys high-level information
bidirectionally between the machine-independent front end of the compiler and
the machine-dependent back end, making it easy to implement a number of common
back end optimizations that are difficult to achieve for other virtual
machines. PVM is similar to many real computer architectures and has an option
to efficiently gather dynamic measurements of virtual machine usage. These
measurements can be used in performance prediction for ports to other
architectures as well as design decisions related to proposed optimizations
and object representations.},
keywords = {Lisp, Futures, Virtual Machine},
}

@article{FrWi78,
author = {Friedman, Daniel P. and Wise, David S.},
title = {{Aspects of Applicative Programming for Parallel Processing}},
journal = i3ec,
volume = {27},
number = {4},
month = {April},
year = {1978},
pages = {289--296},
abstract = {Early results of a project on compiling stylized recursion into
stackless iterative code are reviewed as they apply to a target environment
with multiprocessing. Parallelism is possible in executing the compiled image
of argument evaluation (collateral argument evaluation in Algol 68), of data
structure construction when suspensions are used, and of functional
combinations. The last facility provides generally, concise expression for all
operations performed in Lisp by mapping functions and in APL by typed
operators; there are other uses as well.},
keywords = {Combinators},
}

@inproceedings{GaLe87,
author = {Gaudiot, J. L. and Lee, L. T.},
title = {{Multiprocessor Systems Programming in a High-Level Data-Flow
Language}},
booktitle = {{PARLE '87, Parallel Architectures and Languages Europe, Volume 1:
Parallel Architectures}},
editor = {de Bakker, J. W. and Nijman, A. J. and Treleaven, P. C.},
address = {Eindhoven, The Netherlands, June 15--19},
year = {1987},
pages = {134--151},
series = {Lecture Notes in Computer Science},
volume = {258},
publisher = {Springer, Berlin},
abstract = {The data-flow model of computation is an attractive methodology
for multiprocessor programming for it offers the potential for unlimited
parallelism detection at no programmer's expense. It is here applied to a
distributed architecture based on a commercially available microprocessor (the
Inmos Transputer). In this project, we have integrated the high-level data
driven principles of scheduling within the Transputer architecture so as to
provide high programmability of our multicomputer system. A completet
programming environment which translates a complex data-flow program graph
into occam has been developed and is presented in this paper. We here describe
in detail the mapping from the SISAL high-level constructs into the low-level
mechanisms of the Transputer (synchronization, structure representation,
etc.). The partitioning issues (granularity of the graph) are presented and
several solutions based upon both data-flow analysis (communication costs) and
program syntax (program structure) are proposed and have been implemented in
our programming environment. Finally, we present and analyze graph allocation
and optimization schemes to improve the performance of the resulting occam
program.}, 
keywords = {SISAL, Dataflow},
}

@inproceedings{Geor89,
author = {George, Lal} ,                             
title = {{An Abstract Machine for Parallel Graph Reduction}},                  
booktitle = {{FPCA '89, Functional
Programming Languages and Computer Architecture}},
address = {London, UK, September 11--13},
year = {1989},
pages = {214--229},
publisher = {ACM Press, New York},
abstract = {An abstract machine for parallel graph reduction on a shared
memory multiprocessor is described. This is intended primarily for normal
order ({\em lazy\/}) evaluation of functional programs. It is absolutely
essential in such a design to adapt an efficient sequential model since during
execution under limited resources available, performance will be reduced in
the limit to that of the sequential engine. Parallel evaluation of normal
order functional languages performed naively can result in poor overall
performance despite the avilability of sufficient processing elements and
parallelism in the application. Needless context switching, task migration and
continuation building may occur when a sequential thread of control would have
sufficed. Furthermore, the compiler using static information cannot be fully
aware of the availability of resources and their optimal utilization at any
moment in run time. Indeed this may vary between runs which further aggravates
the job of the compiler writer in generating optimal and compact code for
programs. The benefits derived from this model are: 1) it is based on the
G-machine so that execution under limited resources will default to a
performance close to that of the G-machine; 2) the additional instructions
needed to control the complexities of paralllel evaluation are extremely
simple, almost trivializing the job of the compiler writer; 3) attempts are
made where possible to avoid context switching and task migration by retaining
a sequential thread of control (made more clear in the paper), and 4) the
method has demonstrated good overall performance on a shared memory
multiprocessor.},
keywords = {Parallel Graph Reduction, G-Machine},
}

@inproceedings{GHKP88,
author = {Glauert, J. R. W. and Hammond, K. and Kennaway, J. R. and
Papadopoulos, G. A.}, 
title = {{Using Dactl to Implement Declarative Languages}},
booktitle = {CONPAR 88},
address = {Manchester, UK, September 12--16},
year = {1988},
pages = {87--94},
volume = {A},
publisher = {British Computer Society},
abstract = {Dactl is a language of graph rewriting, intended as an
intermediate language and compiler target language for highly parallel
machines. It is based on a form of graph rewriting which can be used to
implement functional, logic, and imperative languages.},
keywords = {Intermediate Language, Compilation},
}

@article{GlDe87,
author = {Gl{\"u}ck, Robert and Demuth, Christian},
title = {{OC-FP: An Applicative Language Combination with Occam and the Algebra
of Processes}},
journal = {Microprocessing and Microprogramming},
volume = {21},
year = {1987},
pages = {549--558},
publisher = {North-Holland},
abstract = {A functional language based on the FP-model is presented. The
language combines occam processes and is used as a command level language for
occam systems. It is well suited as a graphical language for concurrent
programming. An algebra, based on the functional properties of the language,
makes process and communication structures accessible to (semi-)automatical
transformations.}, 
keywords = {FP, Occam, Process Algebra},
}

@article{GoGa89,
author = {Goldman, Ron and Gabriel, Richard P.},
title = {{Qlisp: Parallel Processing in Lisp}},
journal = {IEEE Software},
year = {1989},
pages = {51--59},
month = {July},
abstract = {There is a serious need for a powerful understandable
multiprocessing language. Qlisp is an extension to Common Lisp designed for
parallel symbolic computation.},
keywords = {Lisp, Futures, Parallel Let},
}

@inproceedings{GoGa88,
author = {Goldman, Ron and Gabriel, Richard P.},
title = {{Qlisp: Experience and New Directions}},
booktitle = {{PPEALS 1988, Parallel Programming:
Experience with Applications, Languages and Systems}},
address = {New Haven, Conneticut, July 19--21},
year = {1988},
series = {SIGPLAN Notices},
number = {23(9)},
month = {September},
publisher = {ACM Press, New York},
pages = {111--123},
abstract = {Qlisp, a dialect of Common Lisp, has been proposed as a
multiprocessing programming language which is suitable for studying the styles
of parallel programming at the medium-grain level. An initial version of Qlisp
has benn implemented on a multiprocessor and a number of experiments with it
conducted. This paper describes the implementation, reports on some of the
experiments, and presents some new constructs that ae suggested from
programming experience with Qlisp.},
keywords = {Lisp, Futures, Parallel Let},
}

@inproceedings{GoHu86,
author = {Goldberg, Benjamin and Hudak, Paul},
title = {{Alfalfa: Distributed Graph Reduction on a Hypercube Multiprocessor}},
booktitle = {{Graph Reduction, Proceedings of a Workshop}},
address = {Santa Fe, New Mexico, September 29 -- October 1},
year = {1987},
volume = {279},
series = {Lecture Notes in Computer Science},
publisher = {Springer, Berlin},
editor = {Fasel, Joseph H. and Keller, Robert M.},
pages = {94--113},
abstract = {Alfala is an implementation of a functional language on the Intel
iPSC multiprocessor. It is based on a heterogeneous abstract machine model
consisting of both graph reduction and stack oriented execution. Alfala
consists of two major components, a compiler and a run-time system. The source
language, Alfl, contains no constructs that allow the programmer to specify
parallelism or synchronization and thus it is the task of the compiler to
detect the exploitable parallelism in a program. The runtime system supports
dynamic scheduling, interprocessor communication, and storage management. A
number of statistics gathered during execution are presented.},
keywords = {Compilation, Serial Combinators, Scheduling},
}

@phdthesis{Gold88,
author = {Goldberg, Benjamin F.},
title = {{Multiprocessor Execution of Functional Programs}},
type = {Research Report YALEU/DCS/RR-618},
school = {Department of Computer Science},
address = {Yale University},
month = {April},
year = {1988},
abstract = {Functional languages have recently gained attention as vehicles
for programming in a concise and elegant manner. In addition, it has been
suggested that functional programming provides a natural methodology for
programming {\em multiprocessor\/} computers. This dissertation demonstrates
that multiprocessor execution of functional programs is feasible, and results
in a significant reduction of their execution times. Two implementations of
the functional language ALFL were built on commercially available
multiprocessors. {\em Alfala\/} is an implementation on the Intel iPSC
hypercube multiprocessor, and {\em Buckwheat\/} is an implementation on the
Encore Multimax shared-memory multiprocessor. Each implementation includes a
compiler that performs {\em automatic decomposition\/} of ALFL programs. The
compiler is responsible for detecting the inherent parallelism in a program,
and decomposing the program into a collection of tasks, called {\em serial
combinators\/}, that can be executed in parallel. One of the primary goals of
the compiler is to generate serial combinators exhibiting the coarsest
granularity possible without sacrificing useful parallelism. This dissertation
describes the algorithms used by the compiler to analyze, decompose, and
optimize functional programs. The abstract model supported by Alfala and
Buckwheat is called {\em heterogeneous graph reduction\/}, which is a hybrid of
graph reduction and conventional stack-oriented execution. This model supports
parallelism, lazy evaluation and higher order functions while at the same time
making efficient use of the processors in the system. The Alfala and Buckwheat
run-time systems support dynamic load balancing, interprocessor communication
(if required), and storage management. A large number of experiments were
performed on Alfala and Buckwheat for a variety of programs. The results of
these experiments, as well as the conclusions drawn from them, are
presented.}, 
keywords = {Parallelization, Serial Combinators, Graph Reduction},
}

@techreport{HaBo89,
author = {Harmer, Terence J. and Boyle, James M.},
title = {{An Optimization to Increase Parallelism in Parallel Pure Lisp
Implementations}},
institution = {Argonne National Laboratory},
address = {Argonne, Illinois},
year = {1989},
abstract = {In this paper we consider an optimization that may be applied to
the Lisp list constructor {\em cons\/} when parallel evaluation of function
arguments is possible. We present experimental results comparing
implementations of a typical function with and without this optimization.},
keywords = {Lisp, Lists},
}

@article{Hals85,
author = {{Halstead, Jr.}, Robert H.},
title = {{Multilisp: A Language for Concurrent Symbolic Computation}},
journal = toplas,
volume = {7},
number = {4},
month = {October},
year = {1985},
pages = {501--538},
abstract = {Multilisp is a version of the Lisp dialect Scheme extended with
constructs for parallel execution. Like Scheme, Multilisp is oriented toward
symbolic computation. Unlike some parallel programming languages, Multilisp
incorporates constructs for causing side effects and for explicitly
introducing parallelism. The potential complexity of dealing with side effects
in a parallel context is mitigated by the nature of the parallelism constructs
and by support for abstract data types: a recommended Multilisp programming
style is presented which, if followed, should lead to highly parallel, easily
understandable programs. Multilisp is being implemented on the 32 processor
{\em Concert\/} multiprocessor; however, it is ultimately intended for use on
larger multiprocessors. The current implementation, called {\em Concert
Multilisp\/}, is complete enough to run the Multilisp compiler itself and has
been run on Concert prototypes including up to eight processors. Concert
Multilisp uses novel techniques for task scheduling and garbage collection.
The task scheduler helps control excessive resource utilization by means of an
unfair scheduling policy; the garbage collector uses a multiprocessor
algorithm based on the incremental garbage collector of Baker.},
keywords = {Lisp, Futures},
}

@article{Hals86,
author = {{Halstead, Jr.}, Robert H.},
title = {{Parallel Symbolic Computing}},
journal = i3ec,
month = {August},
year = {1986},
volume = {19},
number = {8},
pages = {35--43},
abstract = {Programs differ from one another in many dimensions. In one such
dimension, programs can be laid out along a spectrum with predominantly
symbolic programs at one end and predominantly numerical programs at the
other. The difference between numerical and symbolic programs suggest
different approaches to parallel processing. This article explores the problem
and opportunities of parallel symbolic computing and describes the language
Multilisp, used at M.I.T. for experiments in parallel symbolic programming.},
keywords = {Multilisp, Symbolic Computation}
}

@techreport{Harr86,
author = {Harrison III, W. Ludwell},
title = {{Compiling Lisp for Evaluation on a Tightly Coupled Multiprocessor}},
type = {CSRD Report},
number = {565},
month = {March},
year = {1986},
institution = {Center of Supercomputing Research and Development},
address = {University of Illinois, Urbana, IL},
abstract = {The problem of compiling Lisp for efficient evaluation on a large,
tightly coupled, shared memory multiprocessor is investigated. A
representation for s-expressions which facilitates parallel evaluation is
proposed, along with a sequence of transformations, to be applied to the
functions comprising a Lisp program, which reveal and exploit parallelism.},
keywords = {Lisp, Compilation, Parallelization},
}

@techreport{Harr89,
author = {Harrison III, W. Ludwell},
title = {{The Interprocedural Analysis and Automatic Parallelization of Scheme
Programs}},
number = {CSRD No. 860, UILU-ENG-89-8003},
month = {February},
year = {1989},
institution = {Center of Supercomputing Research and Development},
address = {University of Illinois, Urbana, IL},
abstract = {Lisp and its descendants are among the most important and widely
used of programming languages. At the same time, parallelism in the
architecture of computer systems is becoming commonplace. There is a pressing
need to extend the technology of automatic parallelization that has become
available to Fortran programmers of parallel machines, to the realm of Lisp
programs and symbolic computing. In this thesis we present a comprehensive
approach to the compilation of Scheme programs for shared-memory
multiprocessors. Our strategy has two principal components: {\em
interprocedural analysis\/} and {\em program restructuring\/}. We introduce
{\em procedure strings\/} and {\em stack configurations\/} as a framework in
which to reason about interprocedural side-effects and object lifetimes, and
develop a system of interprocedural analysis, using abstract interpretation,
that is used in the dependence analysis and memory management of Scheme
programs. We introduce the transformations of {\em exit-loop translation\/}
and {\em recursion splitting\/} to treat the control structures of iteration
and recursion that arise commonly in Scheme programs. We propose an
alternative representation for s-expressions that facilitates the parallel
creation and access of lists. We have implemented these ideas in a
parallelizing Scheme compiler and run-time system, and we complement the
theory of our work with ``snapshots'' of programs during the restructuring
process, and some preliminary performance results of the execution of object
codes produced by the compiler.},
keywords = {Lisp, Parallelization, Abstract Interpretation},
}

@techreport{HaPe91,
author = {Hammond, Kevin and {Peyton Jones}, Simon},
title = {{Profiling scheduling strategies on the GRIP parallel reducer}},
institution = {Department of Computing Science},
address = {University of Glasgow, UK},
month =	 {August},
year = {1991},
abstract = {It is widely claimed that functional languages are particularly
suitable for programming parallel computers. A claimed advantage is that the
programmer is not burdened with details of task creation, placement,
scheduling, and synchronization, these decisions being taken by the system
instead. Leaving aside the question of whether a pure functional language is
expressive enough to encompass all the parallel algorithms we might wish to
program, there remains the question of how effectively the compiler and
run-time system map the program onto a real parallel system, a task usually
carried out mostly by the programmer. This is the question we address in this
paper. We first introduce the system architecture of GRIP, a shared-memory
parallel machine supporting an implementation of the functional language {\sc
Haskell}. GRIP executes functional programs in parallel using compiled
supercombinator graph reduction, a form of declarative rule system. We then
describe several strategies for run-time resource control which we have tried,
presenting comprehensive measurements for their effectiveness. We are
particularly concerned with strategies controlling task creation, in order to
improve task granularity and minimize communication overheads. This is, so far
as we know, one of the first attempts to make a systematic study of
task-control strategies in a high-performance parallel functional-language
system. GRIP's high absolute performance render these results credible for real
applications.},
keywords = {Process Scheduling, Load Balancing},
}

}

@article{HBP86,
author = {Hankin, Chris L. and Burn, Geoffrey, L. and Peyton Jones, Simon L.},
title = {{A Safe Approach to Parallel Combinator Reduction (Extended
Abstract)}},
journal = {Theoretical Computer Science},
volume = {56},
number = {1},
pages = {17--35},
year = {1988},
note = {Also: Extended Abstract in ESOP 86, European Symposium on Programming,
Saarbr{\"u}cken, Germany, March, 1986, pages 99--110, Robinet, B. and Wilhelm,
R. (editors), volume 213 of Lecture Nots in Computer Science, Springer,
Berlin}, 
abstract = {In this paper we present the results of two pieces of work which,
when combined, allow us to go from a program text in a functional language to
a parallel implementation of that program. We present techniques for
discovering sources of parallelism in a program at compile time, and then show
how this parallelism is naturally mapped into a parallel combinator set that
we will define. To discover sources of parallelism in a program, we use
abstract interpretation. Abstract interpretation is a compile-time technique
which is used to gain information about a program that may then be used to
optimize the execution of the program. A particular ue of abstract
interpretation is the strictness analysis of functional programs. In a
language that has lazy semantics, the main potential for parallelism arises
in the evaluation of operands of strict operators. A function is strict in an
argument if its value is undefined whenever the argument is undefined. If we
can use strictness analysis to detect which arguments a function is strict in,
we then know that these arguments can be safely evaluated in parallel because
this will not affect the lazy semantics. Having identified the sources of
parallelism at compile-time it is necessary to communicate these to the
run-time system. In the second part of the paper we use an extended set of
combinators, including a pair of parallel combinators that achieve this
purpose.},
keywords = {Strictness Analysis, Combinators, Director Strings},
}

@article{Huda86,
author = {Hudak, Paul},
title = {{Para-Functional Programming}},
journal = i3ec,
month = {August},
year = {1986},
volume = {19},
number = {8},
pages = {60--70},
abstract = {In this article I introduce {\em para-functional programming\/}, a
methodology for programming multiprocessor computing systems. It is based on a
functional programming model augmented with features that allow programs to be
mapped to specific multiprocessor topologies. The most significant aspect of
the methodology is that it treats the multiprocessor as a single autonomous
computer onto which a program is mapped, rather than as a group of independent
processors that carry out complex communication and require complex
synchronization. In more conventional approaches to parallel programming, the
latter method of treatment is often manifested as processes that cooperate by
message-passing. However, such notions are absent in para-functional
programming; indeed, a single language and evaluation model can be used from
problem inception, to prototypes targeted for uniprocessors, and ultimately to
realization on a parallel machine.},
keywords = {Para-Functional Programming, Annotations, Process Mapping},
}

@inproceedings{Huda86a,
author = {Hudak, Paul},
title = {{Arrays, Non-Determinism, Side-Effects, and Parallelism: A Functional
Perspective (Extended Abstract)}},
booktitle = {Graph Reduction, Proceedings of a Workshop},
address = {Santa Fe, New Mexico, September 29 -- October 1},
year = {1986},
volume = {279},
series = {Lecture Notes in Computer Science},
editor = {Fasel, Joseph H. and Keller, Robert M.},
publisher = {Springer, Berlin},
pages = {312--327},
abstract = {Purely functional incremental updates to arrays, executed in a
non-deterministic manner, are shown to achieve the same effect (in both
efficiency and functionability) as parallel assignment to imperative arrays.
The strategy depends only on the ability of a compiler to recognize that
incremental updates to functional arrays can be done destructively (an
optimization often called ``copy-avoidance'' of ``single-threaded arrays''.)
Special ``pseudo-functional syntax'' is introduced that captures typical
array-update patterns. This syntax falls somewhere between the purely
functional and (im)purely imperative, and makes the inferencing problem fairly
easy. If nothing else, the work represents an interesting intellectual
exercise in the relationship between non-determinism, side-effects, and
parallelism, and tends to blur the traditionally clear-cut distinction between
functional and imperative languages.},
keywords = {Non-Determinism, Arrays},
}

@inproceedings{HuGo85,
author = {Hudak, Paul and Goldberg, Benjamin},
title = {{Serial Combinators: Optimal Grains of Parallelism}},
booktitle = {{Functional Programming Languages and Computer Architecture}},
address = {Nancy, France, September 16--19},
year = {1985},
pages = {382--399},
series = {Lecture Notes in Computer Science},
editor = {Jouannaud, Jean-Pierre},
volume = {201},
publisher = {Springer, Berlin},
abstract = {A method is described for translating a high-level functional
program into combinators suitable for execution on multiprocessors with no
shared memory. It is argued that the granularity of the standard set of fixed
combinators is too fine, whereas the granularity of user-defined functions is
too coarse. The notion of a {\em serial combinator\/} is introduced that in
some sense has optimal granularity, and that takes into account pragmatic
issues such as the complexity of expressions and communication costs between
processors. The technique improves on the standard notion of {\em
super-combinators\/} by making them larger to retain locality and improve
efficiency, and by ensuring that they have no concurrent substructure that
could result in lost parallelism. Simulation results demonstrate improved
performance on both sequential and parallel computing models.},
keywords = {Parallelization, Serial Combinators},
}

@article{HuGo85a,
author = {Hudak, Paul and Goldberg, Benjamin},
title = {{Distributed Execution of Functional Programs Using Serial 
Combinators}},
journal = i3ec,
volume = {34},
number = {10},
month = {October},
year = {1985},
pages = {881--891},
abstract = {A general strategy for automatically decomposing and dynamically
distributing a functional program is discussed, suitable for parallel
execution on multiprocessor architectures with no shared memory. The strategy
borrows ideas from data flow and reduction machine research on one hand, and
from conventional compiler technology for sequential machines on the other.
One of the more troublesome issues in such a system is choosing the right
granularity for the parallel tasks. As a solution we describe a program
transformation technique based on {\em serial combinators\/} that offers in
some sense just the ``right'' granularity for this style of computing, and
that can be ``fine-tuned'' for particular multiprocessor architectures. We
show via simulation the success of our approach.},
keywords = {Compilation, Serial Combinators, Load Balancing},
}

@techreport{HuSu88,
author = {Hudak, Paul and Sundaresh, Raman S.},
title = {{On the Expressiveness of Purely Functional I/O Systems}},
institution = {Department of Computer Science},
address = {Yale University},
type = {Research Report},
number = {YALEU/DCS/RR-665},
month = {December},
year = {1988},
abstract = {Functional programming languages have traditionally lacked
complete, flexible, and yet referentially transparent I/O mechanisms. Previous
proposals for I/O have used either the notion of {\em lazy streams\/} or {\em
continuations\/} to model interaction with the external world. We discuss and
generalize these models and introduce a third, which we call the {\em
systems\/} model, to perform I/O. The expressiveness of the styles are
compared by means of an example. We then give a series of surprisingly simple
translations between the three models, demonstrating that they are not as
different as their programming styles suggest, and implying that the styles
could be mixed within a single program. The need to express non-deterministic
behavior in a functional language is well recognized. So is the problem of
doing so without destroying referential transparency. We survey past
approaches to this problem, and suggest a solution in the context of the I/O
models described. The I/O system of the purely functional language {\sc
Haskell} is presented. The system includes a rich set of operations, and
distinguishes between {\em file\/} and {\em channel\/} I/O. The approach to
non-determinism is also presented. A useful aspect of the design is that it
includes a rigorous specification of the behavior of the operating system,
thus precisely fixing the semantics of the various I/O operations. The {\sc
Haskell} I/O system is capable of supporting may other paradigms of concurrent
computation in a natural way. We demonstrate this through the emulation of
Actors, UNITY, CSP, CCS and Linda.},
keywords = {Streams, Continuations, Non-determinism},
}

@techreport{Iann88,
author = {Iannucci, Robert Alan},
title = {{A Dataflow/von Neumann Hybrid Architecture}},              
number = {MIT/LCS/TR-418},
institution = {Laboratory for Computer Science},
address = {Massachusetts Institute of Technology, Cambridge, MA},
month = {May},  
year = {1988},
abstract = {Dataflow architectures offer the ability to trade program-level
parallelism in order to overcome machine-level latencies in accessing memory
and in communicating with other processors. Dataflow further offers a uniform
synchronization paradigm, representing one end of spectrum wherein the unit of
scheduling is a single instruction. At the opposite extreme are the von
Neumann architectures which schedule on a task, or process, basis. As a basis
for scalable, general purpose multiprocessors, traditional von Neumann
architectures are unsuitable due to their inability to tolerate latency and to
provide means for fine-grained synchronization. This report examines the
spectrum by proposing a new architecture which is a {\em hybrid\/} of dataflow
and von Neumann organizations. The analysis attempts to discover those
features of the dataflow architecture, lacking in a von Neumann machine, which
are essential for tolerating latency and synchronization costs. These features
are captured in the concept of a {\em parallel machine language\/} which can
be grafted on top of an otherwise traditional von Neumann base. In such an
architecture, the units of scheduling, called {\em scheduling quanta\/}, are
bound at compile time rather than at instruction set design time. The parallel
machine language supports this notion via a large synchronization anme space.
It is shown that the combination of dataflow-style explicit synchronization
and von Neumann-style implicit synchronization in the same instruction set
results in an architectural synergism. Using an instruction set which is
strictly less powerful than that of the MIT Tagged-Token Dataflow Architecture
(TTDA), the hybrid architecture can exploit the same kinds of parallelism than
the TTDA. Given that a compiler can generate scheduling quanta of two or three
instructions, the hybrid architecture will execute approximately the same
number of instructions as the TTDA. Larger quanta can result in the hybrid
actually executing {\em fewer\/} instructions than the TTDA, demonstrating the
power of passing state implicitly between program-counter sequenced
instructions.},
keywords = {Dataflow, Hybrid, Compilation}
}

@inproceedings{Jaga86,
author = {Jagannathan, Suresh},
title = {{A Model of Data Backup and Recovery in a Computer System for 
Functional Programming}},
booktitle = {{Formal Description of Programming Concepts III}},
address = {Ebberup, Denmark, August 25--28},
year = {1986},
editor = {Wirsing, Martin},
publisher = {North-Holland, Amsterdam},
abstract = {We present a formal operational model of an abstract data flow
interpreter for an application programming language. This model serves as a
precise specification of an idealized implementation for the VIM computer
system, an experimental system under development at MIT intended to examine
the efficient implementation of functional languages using data flow
principles. The correctness of any implementation of VIM can be verified by
showing that it preserves all the invariants specified by this abstract
model. To demonstrate this technique, we present an extension of this
idealized system which contains backup and recovery facilities to guarantee
the security of all online data against loss or corruption as a result of
hardware malfunction. To show that the augmented system correctly preserves
the system state, we use as simple inductive proof technique that establishes
the equivalence of the behavior of the two systems. Our specification
language is a simple applicative language augmented with operations to perform
first-order existential and universal quantification.},
keywords = {Operating System, Streams, Operational Model},
}

@techreport{JHM*85,
author = {Jorrand, Ph. and Hufflen, J-M. and Marty, A. and Marty,
J-Ch. and Schnoebelen, Ph.},
title = {{FP2 --- The Language and its Formal Definition}},
type = {Research Report},
number = {LIFIA 26},
institution = {Laboratoire d' Informatique Fondamentale et d' Intelligence 
Artificielle},
address = {IMAG Informatique et Mathematiques Appliquees de Grenoble, France},
month = {May},
year = {1985},
abstract = {The purpose of this report is to provide both an informal
introduction to the main concepts of FP2 and a formal definition of its syntax
and semantics. {\em It must be noted that both the language FP2 and its formal
definition are still subject to change.\/} In particular, it is expected that
feedback from implementation and early use of FP2 will be the source of
modifications to the language.},
keywords = {FP2, Process Networks},
}

@techreport{Jorr84,
author = {Jorrand, Philippe},
title = {{FP2: Functional Parallel Programming Based on Term Substitution}},
type = {Research Report},
number = {LIFIA 15},
institution = {Laboratoire d' Informatique Fondamentale et d'
Intelligence Artificielle},
address = {IMAG Informatique et Mathematiques Appliquees de Grenoble, France},
month = {November},
year = {1984},
abstract = {The purpose of this paper is to present FP2 (Functional Parallel
Programming), a language where communication, functions and parallelism are
consistent notions: its design is simple and its formal semantics are
rigorously defined. This is made possibleby a systematic use of terms and term
substitutions for representing the basic notions of the language.},
keywords = {FP2, Process Networks},
}
 
@inproceedings{John85,
author = {Johnson, Thomas},
title = {{Lambda Lifting: Transforming Programs to Recursive Equations}},
booktitle = {{Functional Programming Languages and Computer Architecture}},
address = {Nancy, France, September 16--19},
year = {1985},
pages = {190--203},
series = {Lecture Notes in Computer Science},
editor = {Jouannaud, Jean-Pierre},
volume = {201},
publisher = {Springer, Berlin},
abstract = {Lambda lifting is a technique for transforming a functional
program with local function definitions, possibly with free variables in the
function definitions, into a program consisting only of global function
(combinator) definitions which will be used as rewrite rules. Different ways
of doing lambda lifting are presented, as well as reasons for rejecting or
selecting the method used in our Lazy ML compiler. A functional program
implementing the chosen algorithm is given.},
keywords = {Lambda Lifting, Program Transformation},
}

@article{JoSi89,
author = {Jones, S. B. and Sinclair, A. F.},
title = {{Functional Programming and Operating Systems}},
journal = compj,
volume = {32},
number = {2},
pages = {162--174},
month = {April},
year = {1989},
abstract = {There is a large class of programming problems to which, at first
sight, functional programming does not seem suited: interactive programs and
programs which must access external resources such as file stores or
communication systems. We take operating systems as the archetype of this
class of problems. We show how the lazy evaluation of functional programs, in
particular lazily evaluated infinite lists, or streams, can be exploited to
write interactive programs. This extends quite naturally to a software design
paradigm comprising processes and networks, to the control of peripheral
devices, and hence to the design of operating systems. We present a design for
a simple, single user, multiprogramming operating system. This demonstrates
workable solutions to operating system structuring, to low level device
interfacing, to high level application program interfacing, and to resource
management. We examine the paradigm illustrated, with reference to alternative
approaches which have been proposed.},
keywords = {Non-Determinism, Streams, Process Networks, Operating Systems},
}

@techreport{Joy90a,
author = {Joy, M. S.},
title = {{Parallel Evaluation of Functional Programs --- A Bibliography}},
institution = {Department of Computer Science},
address = {University of Warwick},
type = {Functional Language Implementation Project Document},
number = {18}, 
month = {April},
year = {1990},
abstract = {A bibliography of documents relating to the execution of
functional programs in a parallel environment is presented together with
abstracts of (some of) the documents.},
keywords = {Bibliography},
}

@inproceedings{KHM89,
author = {Kranz, David A. and {Halstead, Jr.}, Robert H. and Mohr, Eric},
title = {{Mul-T: A High-Performance Parallel Lisp}},
booktitle = {{SIGPLAN '89 Conference on Programming Language
Design and Implementation}},
address = {Portland, Oregon, June 21--23},
year = {1989},
publisher = {ACM Press, New York},
series = {SIGPLAN Notices},
number = {24(7)},
month = {July},
pages = {81--90},
abstract = {Mul-T is a parallel Lisp system, based on Multilisp's {\tt future}
construct, that has been developed to run on an Encore Multimax processor.
Mul-T is an extended version of the Yale T system and uses the T system's
ORBIT compiler to achieve ``production quality'' performance on stock hardware
--- about 100 times faster than Multilisp. Mul-T shows that futures can be
implemented cheaply enough to be useful in a production-quality system. Mul-T
is fully operational, including a user interface that supports managing groups
of parallel tasks.},
keywords = {Lisp, Futures},
}

@article{Kieb86,
author = {Kieburtz, Richard B.},
title = {{When Chasing your Tail Saves Time}},
journal = {Information Processing Letters},
volume = {23},
number = {6},
month = {December},
year = {1986},
pages = {321--324},
abstract = {Several investigators have observed experimentally that, in
evaluating functional language by graph reduction, the time required for
evaluation depends rather strongly on whether or not cyclic data structures
are used to represent recursively defined expressions. The result is
surprising, and the purpose of this short article is to explain it.},
keywords = {Circular Programs},
}
 
@incollection{Kuch89,
author = {Kuchen, Herbert},
title = {{Applikative Datenstrukturen in der parallelen abstrakten Maschine PAM
(Applicative Data Structures in the Parallel Abstract Machine PAM) (in
German)}},
booktitle = {{Funktionale und Logische Programmierung --- Sprachen, Methoden,
Implementationen}},
editor = {Dosch, Walter},
publisher = {Report 214, University Augsburg, Germany},
month = {December},
year = {1989},
abtract = {It is explained which requirements data structures for a functional
programming language must fulfill. The applicative data structures list,
sequence and distributed applicative array are compared according to these
demands. For some example programs the runtimes and speedups are given that
were measured for a parallel prototype implementation on a multitransputer
system.},
keywords = {Distributed Data Structures},
}

@inproceedings{KuLo89,
author = {Kuchen, Herbert and Loogen, Rita},
title = {{Parallele Implementierung einer funktionalen Programmiersprache auf
einem Transputer-Mehrprozessor-System (Parallel Implementation of a Functional
Programming Language on a Multitransputer-System) (in German)}},
booktitle = {{1. Transputer-Anwender-Treffen}},
series = {Informatik-Fachberichte},
publisher = {Springer, Berlin},
year = {1989},
abstract = {In this paper we describe the parallel implementation of a
functional programming language on a multitransputer system. First a
functional program is translated into a system of special function definitions
called serial combinators. Herewith the implicit parallelism to be utilized is
made explicit. The system of serial combinators is transformed into code for a
parallel abstract machine based on graph reduction. Furthermore some data
structures of the functional language are introduced and compared. For this
purpose, the runtimes and speedups measured for some example programs are
given.},
keywords = {Parallel Graph Reduction, Serial Combinators, Evaluation
Transformers, Distributed Data Structures}, 
}

@inproceedings{LaHi88,
author = {Larus, James R. and Hilfinger, Paul N.},
title = {{Restructuring Lisp Programs for Concurrent Execution}},
booktitle = {PPEALS 1988, Parallel Programming: Experience with Applications,
Languages and Systems},
address = {New Have, Connecticut, July 19--21},
year = {1988},
publisher = {ACM Press, New York},
pages = {100--110},
series = {SIGPLAN Notices},
volume = {23(9)},
abstract = {This paper describes the technique that the program transformation
system {\sc Curare} uses to restructure Lisp programs for concurrent execution
in Multiprocessor Lisp systems and discusses the problems inherent in
producing concurrent programs in a flexible and dynamic programming language
such as Lisp. {\sc Curare}'s overall organization is similar to other program
restructuring systems: it detects potential conflicts between statements in a
program, then transforms the program to improve its concurrent performance,
and finally inserts synchronization to ensure the program's concurrent
behavior. However, the language and programs that {\sc Curare} transforms are
very different from the FORTRAN programs that are the traditional targets of
program restructuring and so {\sc Curare} requires new algorithms and
approaches, which are described in this paper.},
keywords = {Lisp, Parallelization},
}

@inproceedings{Lee88,
author = {Lee, M. K. O.},
title = {{A Parallel Computational Model for Functional Languages and Its
Formal Specification in CSP}},
booktitle = {CONPAR 88},
address = {Manchester, UK, September 12--16},
year = {1988},
pages = {9--16},
volume = {C},
publisher = {British Computer Society},
abstract = {This paper presents a parallel and distributed computational model
for functional languages based on the notion of demand/data driven reduction
of graphical combinators. A graphical combinator language is used as the
canonical language form for the computational model. The behavior of the
model is formally described using Hoare's CSP constructive specification of a
detailed and practical parallel computational model of this kind. CSP is found
to be a suitable and valuable tool for conveying complex design ideas within a
parallel and distributed framework.},
keywords = {Semantics, CSP},
}

@article{LiHu86,
author = {Li, Kai and Hudak, Paul},
title = {{A New List Compaction Method}},
journal = {Software --- Practice and Experience},
volume = {16},
number = {2},
pages = {145--163},
month = {February},
year = {1986},
abstract = {List compaction, or so-called `cdr-coding', can greatly reduce the
storage needs of list processing languages. However, existing methods do not
perform well when several lists are being constructed simultaneously from the
same heap, since the non-contiguous nature of the cells being allocated
eliminates the opportunity for compaction. This situation arises not only in
true parallel systems sharing a common memory, and sequential systems
supporting multiple processes, but also quite often in purely sequential
systems, where it is not uncommon to build several different lists
simultaneously within a single loop. In this paper, a new list compaction
method is presented that performs well during both sequential and `parallel'
list generation. The method is essentially a generalization of cdr-coding, in
which lists are represented explicitly as linked vectors rather than
implicitly as compacted memory. In addition, an encoding scheme is used that
is simple or simpler than all known encodings, and destructive operations are
supported with no greater overhead than existing schemes. Performance figures
in a simulated environment suggest that the strategy consistently performs
better than conventional cdr-coding, with essential the same complexity.},
keywords = {List compaction},
}

@inproceedings{LKID89,
author = {Loogen, Rita and Kuchen, Herbert and Indermark, Klaus and Damm,
Werner},
title = {{Distributed Implementation of Programmed Graph Reduction}},
booktitle = {{PARLE '89, Parallel Architectures and Languages Europe, Volume I:
Parallel Architectures}},
address = {Eindhoven, The Netherlands, June 12--16},
year = {1989},
editor = {Odijk, E. and Rem, M. and Syre, J.-C.}, 
series = {Lecture Notes in Computer Science},
volume = {365},
publisher = {Springer, Berlin},
page = {136--157},
abstract = {Programmed graph reduction has been shown to be an efficient
implementation technique for lazy functional languages on sequential machines.
Considering programmed graph reduction as a generalization of conventional
environment-based implementations where the activation records are allocated
in a graph instead of on a stack it becomes very easy to use this technique
for the execution of functional programs in a parallel machine in distributed
memory. We describe in this paper the realization of programmed graph
reduction in PAM --- a parallel abstract machine with distributed memory.
Results of our implementation of PAM on an {\sc Occam}-Transputersystem are
given.}, 
keywords = {Programmed Graph Reduction, Serial Combinators, Evaluation
Transformers}, 
}

@techreport{Loog87,
author ={Loogen, Rita},
title = {{Design of a Parallel Programmable Graph Reduction Machine With
Distributed Memory}},
type = {Aachener Informatik-Berichte},
number = {87-11},
institution = {RWTH Aachen, Fachgruppe Informatik},
address = {Aachen, Germany},
year = {1987},
abstract = {This paper describes the systematic design of a parallel
programmable graph reduction machine with distributed memory. The abstract
architecture that is presented is not just one of many proposals in this area,
but highlights the essential features of such machines. Parallel program
execution will naturally cause additional organizational overhead (message
handling, task distribution, workload balancing) that will certainly slow down
the reduction process if only one processor is available in each node of the
multicomputer system. To avoid such loss in efficiency each processor node of
our abstract architecture consists of two processing units --- the reduction
unit and the input/output unit --- which work in parallel exchanging messages
in a shared local memory. The reduction unit performs the program evaluation
by programmed graph reduction, while the input/output unit keeps
organizational work that is important for the parallel program execution, but
can be done independently from reduction process, away from the reduction
unit. In addition to the gain in efficiency this organization of the abstract
architecture allows to discuss the issues of the graph reduction process, the
parallelization of this process and the structure of the communication network
separately.}, 
keywords = {Programmed Graph Reduction},
}

@inproceedings{MaHa87,
author = {Martin, Chris and Hankin, Chris},
title = {{Finding Fixed Points in Finite Lattices}},
booktitle = {{Functional Programming Languages and Computer Architecture}},
editor = {Kahn, Gilles},
year   = {1987},
address= {Portland, Oregon, USA, September 14--16},
volume = {274},
series = {Lecture Notes in Computer Science},
publisher =  {Springer, Berlin},
pages  = {426--445},
abstract = {Recently there has been much interest in the abstract
interpretation of declarative languages. Abstract interpretation is a
semantics-based approach to program analysis that uses compile time evaluation
of programs using simplified value domains. This gives information about the
run-time properties of programs and provides the basis for significant
performance improvements. A particular example of abstract interpretation is
strictness analysis which allows the detection of parameters in which a
function is strict; these parameters may be passed by value without
compromising the termination properties of the program. The central, most
complex task of an abstract interpreter is finding the fixpoints of recursive
functions in the abstract value space. An elegant algorithm, the frontiers
algorithm, has been proposed by Simon Peyton-Jones and Chris Clack that
performs very well for the strictness analysis of first-order functions. In
this paper we extend their algorithm and show how it can be applied to
higher-order functions over arbitrary fine lattices. This raises the
possibility of using the algorithm as the basis for more general abstract
interpretation tools. We describe the algorithm in a modular way that is
conducive to proofs of correctness and termination properties.},
keywords = {Abstract Interpretation, Fixed Points, Frontiers Algorithm},
}

@inproceedings{Mara91,
author = {Maranget, Luc},
title = {{GAML: a Parallel Implementation of Lazy ML}},
booktitle = {{FPCA '91, Functional Programming Languages and
Computer Architectures}},
address = {Harvard, Massachusetts, USA},
year = {1991},
series = {Lecture Notes in Computer Science},
volume = {523},
editor = {Hughes, John},
publisher = {Springer, Berlin},
pages = {102--123},
abstract = {We present a new parallel implementation of Lazy ML. Our scheme is
a direct extension of the G-machine-based implementation of Lazy ML.
Parallelism is introduced by {\em fork\/} annotations inserted by the
programmer. We discuss the interference of such user annotations with
strictness annotations generated by our compiler. The system has been
implemented on a Sequent Balance computer. We also address the main practical
issues involved, including stack and heap management.},
keywords = {Parallel Graph Reductoin, G-Machine, Annotations},
}

@techreport{Merr91,
author = {Merrall, Simon},
title = {{TPL --- An Implementation of Paralation Lisp in EuLisp}},
institution = {School of Mathematical Sciences},
address = {University of Bath},
year = {1991},
abstract = {The department has recently acquired a Maspar MP-1, a massively
parallel SIMD computer which has 1024 processors scalable to 16384 processors.
We have been looking for an appropriate model of SIMD computation to allow us
to use the MasPar with EuLisp alongside our existing MIMD models. In this
paper we describe the Paralation Model and an experimental serial
implementation of Paralation Lisp written in EuLisp.},
keywords = {Lisp, SIMD},
}

@techreport{Merr91a,
author = {Merrall, Simon},
title = {{Plurals --- A SIMD extension to Eulisp}},
institution = {School of Mathematical Sciences},
address = {University of Bath},
year = {1991},
abstract = {We describe in this report a massively parallel SIMD extension to
EuLisp our local parallel dialect of Lisp, called Plurals. The extensions
allow the programmers to allocate Lisp objects on a data parallel processor
array, in our case a MasPar MP-10111. The physical size of the processor array
is hidden from the programmer who can request different sized sets of virtual
processing elements. This gives a mechanism for working with datasets with
fewer elements than the processor array whilst leaving the remaining PEs free
for other datasets. This has removed us one level from the traditional global
execution model of data parallel processing.},
keywords = {Lisp, SIMD},
}

@article{MiMo90,
author = {Milewski, Jaroslaw and Mori, Imrich},
title = {{Efficient Loop Handling in a Stream-Oriented Unraveling Dataflow
Interpreter}},
journal = {Computers and Artificial Intelligence},
volume = {9},
number = {6},
year = {1990},
pages = {545--570},
abstract = {We propose a detailed implementation of an unraveling dataflow
interpreter of the Id programming language, based on table encoding of
activation names. We concentrate on loop programming construct and on
label-manipulating operators of the underlying dataflow model. Moreover, we
show how to represent and process the stream, being the basic data structure
in classical Id. Since streams are processed by loops, the algorithmic
attractiveness of the former meets in our model the efficient implementation
of the latter.},
keywords = {Dataflow, Id, Streams},
}

@inproceedings{Moor82,
author = {Moor, Ian W.},
title = {{An Applicative Compiler for a Parallel Machine}},
booktitle = {{ACM SIGPLAN Symposium on Compiler Construction}},
address = {Boston, Massachusetts},
series = {SIGPLAN Notices},
number = {17(6)},
month = {June},
year = {1982},
pages = {284--293},
publisher = {ACM Press, New York},
note = {Also: Research Report Doc 83/6, Department of Computing, Imperial
College, London, March 1983},
abstract = {A compiler for the applicative language HOPE is described. The
compiler is itself written in HOPE and generates a machine independent
compiler target language, suitable for execution on the parallel reduction
machine ALICE. The advantages of writing a compiler in a very high level
applicative language ae discussed, and the use of program transformation and
other techniques to turn the initial ``runnable specification'' into a more
efficient (if less clear) program are outlined. Extensions to the HOPE
language and the compiler which can exploit the parallelism and various
execution mode of ALICE are described.},
keywords = {Graph Reduction, ALICE, HOPE},
}

@techreport{MoTo92,
author = {Morrisett, J. Gregory and Tolmach, Andrew},
title = {{A Portable Multiprocessor Interface for Standard ML of New Jersey}},
number = {CMU-CS-92-155},
institution = {School of Computer Science},
address = {Carnegie Mellon University, Pittsburgh, PA},
month =	 {June},
year = {1992},
abstract = {We have designed a portable interface between shared-memory
multiprocessors and Standard ML of New Jersey. The interface is based on the
conventional kernel thread model and provides facilities that can be used to
implement user-level thread packages. The interface supports experimentation
with different thread scheduling policies and synchronization constructs. It
has been ported to three different multiprocessors and used to construct a
general purpose, user-level thread package. In this paper, we discuss the
interface and its implementation and performance, with emphasis on the Silicon
Graphics 4D/380S multiprocessor.},
keywords = {ML, Threads},
}

@techreport{MSA*85,
author = {McGraw, James and Skedzielewski, Stephen and Allan, Stephen
and Oldehoeft, Rod and Glauert, John and Kirkham, Chris and
Noyce, Bill and Thomas, Robert},
title = {{SISAL --- Streams and Iteration in a Single Assignment Language,
Language Reference Manual, Version 1.2}},
number = {M-146},
institution = {Lawrence Livermore National Laboratory},
address = {University of California},
month = {March},
year = {1985},
abstract = {SISAL (Streams and Iteration in a Single-Assignment Language) is a
functional data-flow language intended for use on a variety of sequential,
vector, multi-, and data-flow processors. SISAL is designed to express
algorithms fore execution on computers capable of highly concurrent operation.
More specifically, the application area to be supported is numerical
computation which strains the limits of high performance machines, and the
primary targets for translation of SISAL programs are dataflow data-driven
machines. Nevertheless, it has been our intention that the language not have
idiosyncrasies reflecting the particular nature of the application area or
target machine. It should be reasonable for SISAL to evolve into a general
purpose language appropriate for writing programs to run on future general
parallel computers.},
keywords = {SISAL, Dataflow},
}

@techreport{Nikh87a,
author = {Nikhil, Rishiyur S. },
title = {{Id World Reference Manual (for Lisp Machines)}},
type = {Computation Structures Group Memo},
number = {280},
institution = {Laboratory for Computer Science},
address = {Massachusetts Institute of Technology, Cambridge, Massachusetts},
month = {December},
year = {1987},
abstract = {Id World is a powerful, integrated programming environment
produced by the Computation Structures Group at MIT's Laboratory for Computer
Science for the experimenter who wishes to prepare and execute parallel
programs and study their behavior, as quickly as possible. Id World provides a
powerful and convenient means to prototype parallel programs rapidly and to
study these questions. Programs are written in the parallel programming
language Id Nouveau and executed on GITA, an emulation of the MIT Tagged-Token
Dataflow Architecture. GITA is heavily instrumented to measure many
interesting aspects of parallelism.},
keywords = {Dataflow, Id, Performance Monitoring, Prototyping},
}

@techreport{Nikh87b,
author = {Nikhil, Rishiyur S. },
title = {{Id Nouveau, Reference Manual, Part I: Syntax}},
institution = {Computation Structures Group, Laboratory for Computer Science},
address = {Massachusetts Institute of Technology, Cambridge, Massachusetts},
month = {April},
year = {1987},
abstract = {This document is Part I of two reference manuals for Id Noveau,
the language used to program the MIT Tagged-Token Dataflow Machine. This part
describes the syntax of the language. Id Noveau is a functional programming
language augmented with a parallel data-structuring mechanism called {\em
I-structures\/}.},
keywords = {Dataflow, Id Noveau},
}

@techreport{Nikh87c,
author = {Nikhil, Rishiyur S. },
title = {{Id Nouveau, Reference Manual, Part II: Operational Semantics}},
institution = {Computation Structures Group, Laboratory for Computer Science},
address = {Massachusetts Institute of Technology, Cambridge, Massachusetts},
month = {April},
year = {1987},
abstract = {This document is Part I of two reference manuals for Id Noveau,
the language used to program the MIT Tagged-Token Dataflow Machine. This part
describes the operational semantics of the language. Id Noveau is a functional
programming  language augmented with a parallel data-structuring mechanism
called {\em I-structures\/}.},
keywords = {Dataflow, Id Noveau, Operational Semantics},
}

@techreport{Nikh88,
author = {Nikhil, Rishiyur S. },
title = {{ID (Version 88.0) Reference Manual}},
type = {Computation Structures Group Memo},
number = {284},
institution = {Laboratory for Computer Science},
address = {Massachusetts Institute of Technology, Cambridge, MA},
month = {March},
year = {1988},
abstract = {Id is a parallel programming language designed by members of the
Computation Structures Group of MIT/LCS. It is used for programming dataflow
and other parallel machines. Id is a functional language augmented with a
parallel data-structuring mechanism called {\em I-structures\/}.},
keywords = {Dataflow, Id},
}

@inproceedings{PaCu90,
author = {Papadopoulos, Gregory M. and Culler, David E.},
title = {{Monsoon: An Explicit Token-Store Architecture}},
booktitle = {{17th International Symposium on Computer Architecture}},
address = {Seattle, Washington, May 28--31},
year = {1990},
pages = {82--91},
number = {18(2)},
series = {ACM SIGARCH Computer Architecture News},
month = {June},
abstract = {Dataflow architectures tolerate long unpredictable communication
delays and support generation and coordination of parallel activities directly
in hardware, rather than assuming that program mappings will cause these
issues to disappear. However, the proposed mechanisms are complex and
introduce new mapping complications. This paper presents a greatly simplified
approach to dataflow execution, called the {\em explicit token store\/} (ETS)
architecture, and its current realization in {\em Monsoon\/}. The essence of
dynamic dataflow execution is captured by a simple transition on state bits
associated with storage local to a processor. Low-level storage management is
performed by the compiler in assigning nodes to slots in an {\em activation
frame\/}, rather than dynamically in hardware. The processor is simple, highly
pipelined, and quite general. It may be viewed as a generalization of a fairly
primitive von Neumann architecture. Although the addresssing capability is
restrictive, there is exactly one instruction executed for each action on the
dataflow graph. Thus, the machine oriented ETS model provides new
understanding of the merits and the real cost of direct execution of dataflow
graphs.}, 
keywords = {Dataflow, Compilation, Abstract Machine},
}              

@phdthesis{Papa88,
author = {Papadopoulos, Gregory Michael},
title = {{Implementation of a General Purpose Dataflow Multiprocessor}},
school = {Department of Electrical Engineering and Computer Science},
address = {Massachusetts Institute of Technology, Cambridge, MA},
month = {August},
year = {1988},
abstract = {General purpose multiprocessors have largely failed to meet
expectations for programmability and performance. We blame the lack of usable
parallel programming languages and systems on the underlying processor
architecture. Machines built out of conventional sequential processors simply
do not support the synchronization demands of parallel execution, so the
programmer focuses upon the dangerous and arduous task of discovering a
minimum set of synchronization points without introducing nondeterminism. We
argue that processors must be fundamentally changed to execute a parallel
machine language, in which parallel activities are coordinated as efficiently
as instructions are scheduled. Dataflow architectures address this challenge
by radically reformulating the basic specification of a machine program. These
machines {\em directly execute\/} dataflow graphs, which specify only the
essential prerequisites for the execution of an instruction --- the
availability of operands. Unfortunately, dataflow machines, including the
M.I.T. Tagged Token Dataflow Architecture (TTDA), have labored under a number
of implementation burdens, notably the apparent need for a fully associative
operand matching store which discovers when instructions are able to execute.
We introduce and develop a novel dataflow architecture, the Explicit Token
Store (ETS), which directly executes tagged-token dataflow graphs while
correcting a number of inherent inefficiencies of previous dataflow machines.
In the ETS model, operand matching is performed at compiler-designated offsets
within an activation frame. We show that the ETS is compatible with the TTDA
by giving translations from TTDA machine graphs to ETS machine graphs.
Finally, we describe an implementation of an ETS dataflow multiprocessor,
called Monsoon, now under construction.},
keywords = {Dataflow, Compilation, Id},
}

@inproceedings{PaMe91,
author = {Padget, Julian and Merrall, Simon},
title = {{Bridging the MIMD--SIMD Gap}},
booktitle = {{Workshop on Abstract Machine Models for Parallel Computation}},
year = {1991},
abstract = {This paper concerns some programming models for SIMD architectures
to support symbolic processing. We report some initial experience with the
MasPar array processor and the development of an implementation of EuLisp,
extended by the paralation data structure as the first stages in this
exercise. In particular, it seems that the paralation structure, although
originally conceived for array architectures (viz.\ the Connection Machine),
could be applied equally well to MIMD process management. We describe the
notion of the active paralation as a mechanism to bridge the gap between SIMD
and MIMD architectures.},
keywords = {Lisp, SIMD, MIMD},
}

@techreport{Perr88a,
author = {Perry, Nigel},
title = {{I/O and Inter-language Calling for Functional Languages}},
institution = {Department of Computing},
address = {Imperial College, London, UK},
year = {1988},
abstract = {This paper discusses the provision of I/O and inter-language
calling facilities in pure functional languages. In the introduction we first
describe the scope of ``I/O'' by specifying a number of broad operation
categories. We then describe why the ``purity'' of functional languages poses
problems in providing ``safe'' I/O --- compared to the same task in an
imperative language. The first section is a survey existing I/O proposals for
functional languages which fall into two groups; environment passing where the
file system, I/O devices etc., are passed around as either an explicit or
implicit abstract data structure by the program. Secondly stream based
systems, where all I/O is channelled over ``streams'' which either carry basic
values or more complex data types representing operation request to the
O/S. In the second section we define a new system, termed {\em result
continuations\/}, which provides a complete and safe I/O system. We show how
all the operation categories are handled by this system and prove that the
system is safe. The final section looks at providing inter-languages
calling between functional and imperative languages. We show how, by using
result continuations, an interface may be provided which does not compromise
the referential transparency of the functional language.},
keywords = {Operating Systems, Non-determinism, Result Continuations},
}


@techreport{Perr88b,
author = {Perry, Nigel},
title = {{Towards a Functional Operating System}},
institution = {Department of Computing},
address = {Imperial College, London, UK},
year = {1988},
abstract = {In this short note we discuss an operating system design based
around our previously proposed {\em result continuations\/} thus avoiding the
problems of stream I/O. To handle non-determinism a mall preemptive process
scheduler is defined which is written in assembler and {\em not\/} a
functional language, thus avoiding introducing a non-referentially transparent
construct into the language. This two tiered approach is not unusual for an
operating system written in a high-level language, c.f. Unix.},
keywords = {Operating Systems, Non-Determinism, Result Continuations},
}

@inproceedings{Perr92, 
author = {Perry, Nigel}, 
title = {{Towards a Concurrent Object/Process Oriented Functional Language
(Short Version)}},  
booktitle = {15th Australian Computer Science Conference}, 
volume = {14(1)}, 
series = {Australian Computer Science Communications}, 
month = {January},
year = {1992},
 abstract = {In this paper we first define the programming
methodology continuation passing style (CSP), which originates from tail
functions and upon which result continuations are based, and then give an
introduction to functional language I/O and inter-language working. We then
show how the result continuation mechanism can be extended to support
coroutine programming, and then use this to define remote function call (RPC)
and object style systems. Next we show how concurrent functional programming
can be supported using the result continuation mechanism, this work stems from
our sketch of a functional operating system described in Perry. In conclusion
we describe how this paper lays the groundwork for the development of
concurrent functional and object/process-oriented programming paradigms and
support non-determinism. A semantics for result continuations,
functional coroutines, remote function calls, and concurrent functional
programming is being developed. The semantics are defined using a version of
the $\pi$-calculus and enable concurrent functional programs to be reasoned
about.  This reasoning can either be at two levels; using standard functional
notation ($\lambda$-calculus) reasoning for the individual processes and
$\pi$-calculus style reasoning for the concurrent communication, treating each
process as a ``black box''; or at a single level by implementing the
$\lambda$-calculus in the $\pi$-calculus.},
keywords = {Continuation Passing, Non-Determinism, Concurrent Objects,
Parallel Semantics}, 
}

@inproceedings{PeSa89,
author = {Peyton Jones, Simon L. and Salkild, Jon},
title = {{The Spineless Tagless G-machine}},
booktitle = {{FPCA '89, Functional Programming
Languages and Computer Architecture}},
address = {London, UK, September 11--13},
year = {1989},
publisher = {ACM Press, New York},
abstract = {The Spineless Tagless G-machine is an abstract machine based on
graph reduction, designed as a target for compilers for non-strict functional
languages. As its name implies, it is a development of earlier work,
especially the G-Machine and TIM. It has a number of unusual features: the
abstract machine code is rather high-level than is common, allowing better
code generation; the representation of the graph eliminates most interpretive
overheads; vectored returns from data structures give fast case-analysis; and
the machine is readily extended for a parallel implementation.},
keywords = {Parallel Graph Reduction, G-Machine},
}

@article{Peyt89,
author = {Peyton Jones, Simon L.},
title = {{Parallel Implementations of Functional Programming Languages}},
journal = {The Computer Journal},
volume = {32},
number = {2},
pages = {175-186},
month = {April},
year = {1989},
abstract = {One of the most attractive features of functional programming
languages is their suitability for programming parallel computers. This paper
is devoted to discussion of such a claim. Firstly, parallel functional
programming is discussed from the programmer's point of view. Secondly, since
most parallel functional language implementations are based on the concept of
graph reduction, the issues raised by graph reduction are discussed. Finally,
the paper concludes with a case study of a particular parallel graph reduction
machine and a survey of other parallel architectures.},
keywords = {Parallel Graph Reduction, Para-Functional Programming, Dataflow}
}

@inproceedings{Peyt91,
author = {{Peyton Jones}, Simon L.},
title = {{The Spineless Tagless G-machine: Second Attempt}},
booktitle = {3rd International Workshop on the Parallel Implementation of
Functional Languages},
address = {Southampton, UK, June 5--7},
year = {1991},
editor = {Glaser, Hugh and Hartel, Pieter},
publisher = {Technical Report CSTR 91-07, University of Southampton},
pages = {147--192},
abstract = {The Spineless Tagless G-Machine is an abstract machine designed to
support functional languages. This presentation of the machine falls into two
parts. Firstly, we present the {\em STG language\/}, an austere but
recognizably-functional language, which as well as a {\em denotational\/}
meaning has a well-defined {\em operational\/} semantics. The STG language is
the ``abstract machine code'' for the Spineless Tagless G-machine, but it is
sufficiently abstract that it can readily be compiled into G-machine G-code or
TIM code instead. Secondly, we discuss the mapping of the STG language onto
stock hardware. The success of an abstract machine model depends largely on
how efficient this mapping can be made, though this topic is often relegated
to a short section. Instead, we give a detailed discussion of the design
issues and the choices we have made.  Our principal target is the C language,
treating the C compiler as a portable assembler.}, 
keywords = {Graph Reduction, Abstract Machine}, 
}

@book{PeLe92,
author = {{Peyton Jones}, Simon L. and Lester, David R.},
title = {{Implementing Functional Languages: a tutorial}},
publisher = {Prentice Hall},
year = 	 {1992},
abstract = {This book gives a practical approach to understanding
implementations of non-strict functional languages using lazy graph reduction.
The book is intended to be a source of practical labwork material, to help
make functional-language implementations ``come alive'', by helping the reader
to develop, modify and experiment with some non-trivial compilers. The unusual
aspect of the book is that it is meant to be {\em executed\/} as well as {\em
read\/}. Rather than merely presenting an abstract description of each
implementation technique, we present the code for a complete working prototype
of each major method, and then work through a series of improvements to it.
All of the code is available in machine-readable form. A section of the book
is devoted to the development of an abstract machine and a compiler for a
parallel G-machine.},
keywords = {Parallel Graph Reduction},
}

@techreport{Repp91,
author = {Reppy, John H.},
title = {{Concurrent Programming with Events -- The Concurrent ML Manual}},
type = {Reference Manual},
number = {Version 0.9.6},
institution = {Department of Computer Science},
address = {Cornell University, Ithaca, NY},
month = {October},
year = {1991},
abstract = {Concurrent ML (CML) is a system for concurrent programming in
Standard ML (SML). A CML program consists of a set of {\em threads\/} (or
light-weight processes). A thread is the sequential evaluation of a ML
expression. It does not have to be a terminating computation; in fact,
infinitely looping threads are often useful. The evaluation of a thread may
involve communication with other threads, which is done by sending a message
on a channel. Message passing is synchronous and forms the basis of
communication and synchronization in CML. This model is extended by {\em
first-class synchronous operations\/}, which provide a mechanism for building
new synchronization and communication abstractions.},
keywords = {ML, Threads},
}

@inproceedings{Repp91a,
author = {Reppy, J. H.},
title = {{CML: A Higher-Order Concurrent Language}},
booktitle = {SIGPLAN '91 Conference on Programming Language Design and
Implementation}, 
address = {Toronto, Ontario, Canada, June 26--28},
year = {1991},
series = {SIGPLAN Notices},
volume = {26(6)},
pages = {294--305},
publisher = {ACM Press, New York},
abstract = {Concurrent ML (CML) is a high-level, high-performance language for
concurrent programming. It is an extension of Standard ML (SML) and is
implemented on top of Standard ML of New Jersey (SML/NJ). CML is a practical
language and is being used to build real systems. It demonstrates that we need
not sacrifice high-level notation in order to have a good performance.},
keywords = {ML, Threads},
}

@techreport{Rich84a,
author = {{Richards Jr.}, Hamilton},
title = {{An Applicative Programming Languages Bibliography}},
institution = {Burroughs Corporation},
address = {Austin Research Center, Austin, Texas},
month = {October},
year = {1984},
keywords = {Bibliography},
}

@inproceedings{RuSa87,
author = {Ruggerio, Carlos A. and Sargeant, John},
title = {{Control of Parallelism in the Manchester Dataflow Machine}},
booktitle = {FPCA '87, Functional Programming Language and Computer
Architecture}, 
address = {Portland, Oregon, September 14--16},
year = {1987},
editor = {Kahn, Gilles},
volume = {274},
series = {Lecture Notes in Computer Science},
publisher = {Springer, Berlin},
pages = {1--15},
abstract = {Fine-grain parallel machines, such as tagged-token dataflow
machines, allow very high degrees of program parallelism to be exploited for
many applications. In fact, so much parallelism can be generated that it is
necessary to {\em control parallelism\/} in order to bound storage use. This
paper reviews software mechanisms for parallelism control, which rely on
merely planting extra code to control execution order. Such methods are found
to be inadequate, so a fundamental architectural mechanism known as throttle
is considered necessary. Various attempts to design a throttle for the
Manchester Dataflow Machine are described. The eventual solution, a
coarse-grain, process-based throttle, is explained, and simulation results are
presented which demonstrate its effectiveness.},
keywords = {Parallelism Control},
}

@inproceedings{SaHe86a,
author = {Sarkar, Vivek and Hennessy, John},
title = {{Partitioning Parallel Programs for Macro-Dataflow}},
booktitle = {{1986 ACM Symposium on Lisp and Functional Programming}},
address = {Cambridge, Massachusetts, August 4--6},
year = {1986},
publisher = {ACM Press, New York},
pages = {202--211},
abstract = {Partitioning techniques are necessary to execute functional
programs at a coarse granularity. Fine granularity execution is inefficient on
general purpose multiprocessors. There is a trade-off between parallelism and
the overhead of exploiting parallelism. We present a compile-time partitioning
approach to achieve this trade-off.},
keywords = {Dataflow, Partitioning}
}

@inproceedings{SSM88,
author = {Sarkar, V. and Skedzielewski, S. and Miller, P.},
title = {{An Automatically Partitioning Compiler for Sisal}},
booktitle = {CONPAR 88},
address = {Manchester, UK, September 12--16},
year = {1988},
pages = {26--33},
volume = {B},
publisher = {British Computer Society},
abstract = {In this paper, we describe a compiler that automatically
repackages Sisal programs to achieve the grain size needed to run efficiently
on different multiprocessors. The compiler is based on an existing
implementation for Sisal on the Sequent Balance multiprocessor, and on previous
work on automatic partitioning of Sisal programs.},
keywords = {Partitioning, Compilation},
}

@inproceedings{ScSc87,
author = {Schaefer, Peters and Schnoebelen, Philippe},
title = {{Specification of a Pipelined Event Driven Simulator using FP2}},
booktitle = {{PARLE '87, Parallel Architectures and Languages Europe, Volume 1:
Parallel Architectures}},
editor = {de Bakker, J. W. and Nijman, A. J. and Treleaven, P. C.},
address = {Eindhoven, The Netherlands, June 15--19},
year = {1987},
pages = {311--328},
series = {Lecture Notes in Computer Science},
volume = {258},
publisher = {Springer, Berlin},
abstract = {This paper deals with the specification of a pipelined event
driven algorithm for discrete simulation using the Functional Parallel
Programming Language FP2. We start with a short review of the pipelined event
driven simulation algorithm and the main features of FP2. In the main part,
the FP2 specification procedure and a proof methodology for liveness and
correct termination of the specified network of processes are described.},
keywords = {FP2, Process Networks, Verification},
}

@inproceedings{SCE91,
author = {Schauser, Klaus Erik and Culler, David E. and {von Eicken}, 
Thorsten},
title = {{Compiler-Controlled Multithreading for Lenient Parallel Languages}},
booktitle = {{Functional Programming Languages and Computer Architectures}},
address = {Harvard, Massachusetts, USA},
year = {1991},
series = {Lecture Notes in Computer Science},
volume = {523},
publisher = {Springer, Berlin},
pages = {50--71},
abstract = {Tolerance to communication latency and inexpensive synchronization
are critical for general-purpose computing on large multiprocessors. Fast
dynamic scheduling is required for powerful non-strict parallel languages.
However, machines that support rapid switching between multiple execution
threads remain a design challenge. This paper explores how multithreaded
execution can be addressed as a compilation problem, to achieve switching
rates approaching what hardware mechanisms might provide. Compiler-controlled
multithreading is examined through compilation of a lenient parallel language,
Id90, for a threaded abstract machine, TAM. A key feature of TAM is that
synchronization is explicit and occurs only at the start of a thread, so that
a simple model can be applied. A scheduling hierarchy allows the compiler to
schedule logically related threads closely together in time and to use
registers across threads. Remote communication is via message sends and
split-phase memory accesses. Messages and memory replies are received by
compiler-generated message handlers which rapidly integrate these events with
thread scheduling. To compile Id90 for TAM, we employ a new parallel
intermediate form, dual-graphs, with distinct control and data arcs. This
provides a clean framework for partitioning the program into threads,
scheduling threads, and managing registers under asynchronous execution. The
compilation process is described and preliminary measurements of its
effectiveness are discussed. Dynamic execution measurements are obtained via a
second compilation step, which translates TAM into native code for existing
machines with instrumentation incorporated. These measurements show that the
cost of compiler-controlled multithreading is within a small factor of the
cost of control flow in sequential languages.},
keywords = {Lenient Languages, Scheduling, Target Language},
}

@phdthesis{Sijt88,
author = {Sijtsma, B.A.},
title = {{Verification and Derivation of Infinite-List Programs}},
school = {Groningen University, The Netherlands},
year = {1988},
month = {October},
abstract = {We will discuss various aspects of infinite-list programs. More
specifically, we will discuss infinite-list programs written in a functional
programming language with lazy evaluation. Our aim is mainly the
verification of existing infinite-list programs. We present three related
methods for deriving infinite-list programs from specifications. The methods
are such that the verification and derivation go hand in hand; no separate
verification is needed. These methods are in fact theorems and can thus be
used to verify existing infinite-list programs. We also give a method
which can be only used in special cases. In these special cases also the
one of the general methods could have been used, but the special method
generally leads to far more efficient programs. Finally, we derive a program
for a certain class of infinite-list problems. This derivation illustrates how
the scope of the methods can be extended.},
keywords = {Infinite Lists, Circular Programs, Verification},
}

@article{Sijt89,
author = {Sijtsma, Ben A.},
title = {{On the Productivity of Recursive List Definitions}},
journal = toplas,
month = {October},
year = {1989},
volume = {11},
number = {4},
pages = {633--649},
abstract = {Several related notions of the productivity are presented for
functional languages with lazy evaluation. The notion of productivity captures
the idea of computability, of progress of infinite-list programs. If an
infinite-list program is {\em productive\/}, then every element of the list
can be computed in finite ``time''. These notions are used to study recursive
list definitions, that is, lists defined by $l$ where $l=fl$. Sufficient
conditions are given in terms of the function $f$ that either guarantee the
productivity of the list or its unproductivity. Furthermore, a calculus is
developed that can be used in verifying that lists defined by $l\ \mbox{\it
where}\ l = fl$ are productive. The power and the usefulness of our theory are
demonstrated by several nontrivial examples. Several observations are given in
conclusion.}, 
keywords = {Circular Programs},
}

@techreport{SkGl85,
author = {Skedzielewski, Stephen and Glauert, John},
title = {{IF1 --- An Intermediate Form for Applicative Languages, Version
1.0}},
institution = {Lawrence Livermore National Laboratory},
number = {M-170},
address = {University of California},
month = {July},
year = {1985},
abstract = {The intent of IF1 is to define a language that can be the target
of several compilers for applicative languages. We hope to be able to exchange
programs translated from different sources via IF1, and share the rather
limited resource of programs written in applicative languages. The language
that we define is not sufficiently general to be used by implementors of all
applicative languages; it is strongly based on the features of SISAL and VAL.
It is our hope that IF1 will be sufficient to form the basis for other, more
comprehensive intermediate forms for a larger set of applicative languages.},
keywords = {Intermediate Language, SISAL},
}

@inproceedings{SkWe85,
author = {Skedzielewski, Stephen and Welcome, Michael},
title = {{Data Flow Graph Optimization in IF1}},                       
editor = {Jouannaud, Jean-Pierre},
booktitle = {{FPCA '85, Functional Programming Languages and Computer
Architecture}},
address = {Nancy, France, September 16--19},
year = {1985},
pages = {17--34},
series = {Lecture Notes in Computer Science},
volume = {201},
publisher = {Springer, Berlin},
note = {Also: Technical Report UCRL-92122, Lawrence Livermore National 
Laboratory, University of California},
abstract = {Optimization techniques are as important when compiling dataflow
languages as when compiling conventional languages. This paper describes work
that has been done on optimizers for SISAL programs that have been translated
into IF1 data flow graphs. It shows that conventional optimization algorithms
can be easily and efficiently implemented for data flow graphs, and that the
payoff for even simple optimizations can be significant.},
keywords = {Compilaton, Dataflow, SISAL}
}

@inproceedings{StHi86,
author = {Steele Jr., Guy L. and Hillis, W. Daniel},
title = {{Connection Machine Lisp: Fine-Grained Parallel Symbolic
Processing}}, 
booktitle = {1986 ACM Conference on Lisp and Functional Programming},
address = {Cambridge, Massachusetts, August 4--6},
publisher = {ACM Press, New York},
year = {1986},
pages = {279--297},
abstract = {Connection Machine Lisp is a dialect of Lisp extended to allow a
fine-grained, data-oriented style of parallel execution. We introduce a new
data structure, the xapping, that is like a sparse array whose elements can be
processed in parallel. This kind of processing is suitable for implementation
by such fine-grained parallel computers as the Connection Machine System and
NON-VON. Additional program notation is introduced to indicate various
parallel operations. The symbols $\alpha$ and $\circ$ are used, in a manner
syntactically reminiscent of the backquote notation in Common Lisp, to
indicate what parts of an expression are to be executed in parallel. The
symbol $\beta$ is used to indicate permutation and reduction of sets of data.
Connection Machine Lisp in practice leans heavilyon APL and FP and their
descendants. Many ideas and stylistic idioms can be carried over directly.
Some idioms of Connection Machine Lisp are difficult to render in APL because
Connection Machine Lisp xappings may be sparse while APL vectors are not
sparse. We give many small examples of programming in Connection Machine Lisp.
Two metacircular interpreters for a subset of Connection Machine Lisp are
presented. One is concise but suffers from defining $\alpha$ in terms of
itself in such a way as to obscure its essential properties. The other is
longer but facilitates presentation of these properties.},
keywords = {Lisp, SIMD, Xappings},
}

@article{Take87,
author = {Takeichi, Masato},
title = {{Partial Parameterization Eliminates Multiple Traversals of Data
Structures}}, 
journal = acta,
year = {1987},
volume = {24},
pages = {57--77},
abstract = {The use of higher order functions in functional programming opens
up the possibility of defining functions by partial parameterization, and lazy
evaluation brings out a new approach in programming methodology. This paper
describes a new transformation technique based on partial parameterization and
fully lazy evaluation for eliminating multiple traversals of data structures.
It uses no particular mechanisms in functional programming, whereas it
transforms a wider class of programs into efficient ones than that proposed so
far.}, 
keywords = {Partial Parameterization, Circular Programs},
}

@techreport{Trau86,
author = {Traub, Kenneth R.},
title = {{A Compiler for the MIT Tagged-Token Dataflow Architecture}},
number = {MIT/LCS/TR-370},
institution = {Laboratory for Computer Science},
address = {Massachusetts Institute of Technology, Cambridge, MA},
month = {August},
year = {1986},
abstract = {Compilation of the programming language Id Nouveau into machine
code for the MIT tagged-token dataflow architecture is thoroughly described.
Id Nouveau is a higher-order functional language augmented with a novel data
structure facility known as I-Structures. The tagged-token dataflow
architecture is a dataflow computer of the dynamic variety. Compilation takes
place in two steps. In the first step, the Id Nouveau program is converted
into an abstract dataflow graph called a program graph. Program graphs embody
no detailed knowledge of the target architecture, yet have a very precise
operational semantics. At the same time, they represent data and control flow
in a way very convenient for program transformation. Several common optimizing
transformations are discussed. The second step of compilation converts the
program graph into machine code for the tagged-token architecture, taking into
account the machine's finite resources. Peephole optimizations for the machine
code are discussed, and a general-purpose optimization algorithm is given.},
keywords = {Compilation, Dataflow, Id},
}

@inproceedings{Trau89,
author = {Traub, Kenneth R.},
title = {{Compilation as Partitioning: A New Approach to Compiling Non-Strict
Functional Languages}},
booktitle = {FPCA '89, Functional Programming Languages and
Computer Architecture},
address = {London, UK, September 11--13},
year = {1989},
pages = {75--88},
publisher = {ACM Press, New York},
abstract = {In {\em non-strict\/} functional languages, a data structure may
be read before all its components are written, and a function may return a
value before finishing all its computation or even before all its arguments
have been evaluated. Such flexibility gives expressive power to the
programmer, but makes life difficult for the compiler because it may not be
possible to totally order instructions at compile time; the correct order can
very dramatically with the input data. Consequently, the compiler must break
the program into sequential fragments, or {\em threads\/}, whose relative
ordering is determined at run time. Good compilers employ strictness analysis
and other techniques to make threads as large as possible, to minimize run-time
overhead. While partitioning a program into sequential threads is a crucial
issue, existing compilers treat it as a byproduct of applying some other
methodology, such as performing Henderson's force/delay transformations or
generating intermediate code for an abstract graph reduction machine (e.g.,
the G-machine or Tim). In this paper, we present a view of functional language
compilation that takes partitioning a function into sequential threads as the
first order of business. The resulting framework cleanly separates issues of
partitioning, of thread scheduling, of environments, and of data type
representation (including first-class functions). Our method sidesteps both
the force/delay transformation and abstract machines, going directly from
source code to sequential three-address code. Nevertheless, nearly all of the
optimizations proposed for existing approaches are easily expressed. We
consider two non-strict evaluation rules: the familiar lazy evaluation, and
lenient evaluation as found in the language Id. We also consider both
uniprocessor and parallel processor scheduling policies.},
keywords = {Compilation, Thread Partitioning, Scheduling, Lenient Evaluation},
}

@inproceedings{Trau91a,
author = {Traub, Kenneth R.},
title = {{Multi-thread Code Generation for Dataflow Architectures from 
Non-Strict Programs}},
booktitle = {{Functional Programming Languages and Computer Architectures}},
address = {Harvard, Massachusetts, USA},
year = {1991},
series = {Lecture Notes in Computer Science},
volume = {523},
publisher = {Springer, Berlin},
editor = {Hughes, John},
pages = {73--101},
abstract = {This paper presents a new style of code generation for dataflow
architectures, based on a view of such architectures as general multi-threaded
von Neumann machines. Whereas the traditional picture of dataflow object code
consists of tokens flowing along the arcs of a dataflow graph, the
multi-threaded style treats a linear sequence of dataflow instructions as a
sequential thread. Within a thread, data is passed by imperatively reading and
writing an activation frame associated with a procedure invocation, just as in
conventional architectures. Also, advanced dataflow architectures like Monsoon
provide specific support for sequential threads, in the form of general
registers for operand storage within a thread. Between threads, values are
also communicated via the activation frame. The generation and synchronization
of tokens, which in traditional dataflow is the primary means of communicating
data, is relegated haere to a purely control flow role, spawning new threads
and gating their initiation, but themselves carrying no data. Our results show
that in many cases, the multi-threaded style of code generation results in
fewer total machine cycles to executed a given program as compared to the
traditional dataflow style. Surprisingly, this remains true even if the
general registers are not employed. The reason the multi-threaded style is so
successful is that by separating data flow from control flow, redundant
control flow can be eliminated. In other words, fewer forking (token creation)
and joining (token matching) operations are required. Central to eliminating
redundant control flow is the compiler's ability to discover sequential
threads that may be scheduled at compile time. While this is inherently
difficult when starting from a non-strict language such as Id, even small
threads have a beneficial effect. The most dramatic performance improvement
comes about in compiling Id loop, where the language semantics specifies a
greater degree of strictness, resulting in larger threads. We demonstrate the
comparative performance between the multi-thread and traditional dataflow
styles of code generation through empirical observations on the Monsoon
hardware, and also through a detailed analytic examination of the code
generated under each paradigm.},
keywords = {Compilation, Multi-threading, Dataflow},
}

@article{WrFa89,
author = {Wray, S. C. and Fairbairn, J.},
title = {{Non-strict Languages --- Programming and Implementation}},
journal = compj,
volume = {32},
number = {2},
year = {1989},
pages = {142--151},
abstract = {Non-strict evaluation improves the expressive power of functional
languages at the expense of an apparent loss of efficiency. In this paper we
give examples of this expressive power, taking as an example an interactive
functional program and describing the programming techniques depending on
non-strict evaluation which improved its design. Implementation methods for
non-strict languages have delivered poor performance precisely when such
programming techniques have been used. This need not be the case, however, and
in the second part of the paper we describe Tim, a method of implementing
non-strict languages for which the penalty for using lazy evaluation is very
small.},
keywords = {Non-Strictness, Streams, Circular Programs},
}

@inproceedings{Turn81,
author = {Turner, David},
title = {{The Semantic Elegance of Applicative Languages}},
booktitle = {{FPCA '81, Functional Languages and Computer Architecture}},
address = {Portsmouth, New Hampshire, October 18--22},
year = {1981},
pages = {85--92},
publisher = {ACM, New York},
abstract = {In what does the alleged superiority of applicative languages
consist? In the last analysis the answer must be in terms of the reduction in
the time required to produce a correct program to solve a given problem. On
reflection I decided that the best way to demonstrate theis would be to take
some reasonably non-trivial problem and show how, by proceeding within a
certain kind of applicative language framework it was possible to develop a
working solution with a fraction of the effort that would have been necessary
in a conventional imperative language. The particular problem I have chosen
also brings out a number of general points of interest which I shall briefly
discuss afterwards.},
keywords = {Non-strictness, Streams},
}

@techreport{Vree86,
author = {Vree, Willem G.},
title = {{The Grain Size of Parallel Computations in a Functional Program}},
institution = {Computer Science Department},
address = {Dutch Water Board Authority, Rijwijk, The Netherlands},
year = {1986},
abstract = {One problem that limits the exploitability of the available
parallelism in a functional language is that many function applications may
not contain a sufficient amount of computation to justify the overhead that is
caused by data transmission and process synchronization in parallel machine
architectures. This paper concentrates on the grain size problem in relation
to that class of MIMD architectures for which the data-communication and
process-synchronization overrhead is large compared to the elementary machine
instruction time. It will be demonstrated, with the aid of an example program
of moderate size, that it is sometimes necessary to perform complicated
program transformations in order to obtain a useful grain size for MIMD
architectures.},
keywords = {Grain Size},
}

@inproceedings{WaBa90,
author = {Walinsky, Clifford and Banerjee, Deb},
title = {{A Functional Programming Language Compiler for Massively
Parallel Computers}},
booktitle = {{ACM Symposium on Lisp and Functional Programming}},
address = {Nice, France, June 27--29},
year = {1990},
pages = {131--138},
publisher = {ACM Press, New York},
abstract = {Functional programming languages remove programmers from low-level
machine details, an important achievement when programming massively parallel
systems. We present an overview of an FP compiler that generates programs
capable of exploiting data-parallelism, a view of parallelism where distinct
data elements reside on distinct processors and all processors execute a
single instruction stream. To achieve this form of parallelism, FP's sequences
are represented as arrays. This representation makes possible optimization
techniques developed for APL compilers that compose routing functions at
compile-time. These techniques are described succinctly by a set of axioms and
inference rules. We demonstrate the optimizations by compiling several FP
functions, obtaining optimal performance.},
keywords = {FP, Data Parallelism, SIMD},
}

@inproceedings{Wadl86,
author = {Wadler, Philip},
title = {{A New Array Operation}},
booktitle = {{Graph Reduction, Proceedings of a Workshop}},
address = {Santa Fe, New Mexico, September 29 -- October 1},
year = {1986},
pages = {328--335},
volume = {274},
series = {Lecture Notes in Computer Science},
editor = {Fasel, Joseph H. and Keller, Robert M.},
publisher = {Springer, Berlin},
abstract = {Providing efficient operations on arrays in a functional language
is problematic. No completely satisfactory solution has yet been found. This
paper proposes a new solution, which is a variant on the ``monolithic''
approach to array operations. The new solution is also not completely
satisfactory, but does does have advantages complementary to other proposals.
It makes a class of programs easy to express, notably those involving the
construction of histograms. It also allows for parallel implementations
without the need to introduce non-determinism.},
keywords = {Updatable Arrays, Non-determinsim},
}

@inproceedings{WaHu87,
author = {Wadler, Philip and Hughes, R. J. M.},
title = {{Projections for Strictness Analysis}},
booktitle = {{Functional Programming Languages and Computer Architecture}},
editor = {Kahn, Gilles},
address = {Portland, Oregon, USA, September 14--16},
year = {1987},
pages = {385--407},
series = {Lecture Notes in Computer Science},
volume = {274},
publisher = {Springer, Berlin},
abstract = {Contexts have been proposed as a means of performing strictness
analysis on non-flat domains. Roughly speaking, a {\em context\/} describes
how much a sub-expression can be represented using the notion of {\em
projection} from domain theory. This is clearer than the previous explanation
of contexts in terms of continuations. In addition, this paper describes {\em
finite domains\/} of contexts over the non-flat list domain. This means that
recursive context equations can be solved using standard fixpoint techniques,
instead of the algebraic manipulation previously used.},
keywords = {Strictness Analysis, Projections},
}

@inproceedings{WaWa87,
author = {Watson, Paul and Watson, Ian},
title = {{Evaluating Functional Programs on the FLAGSHIP Machine}},
booktitle = {FPCA '87, Functional Programming Language and Computer
Architecture}, 
address = {Portland, Oregon, USA, September 14--16},
year = {1987},
editor = {Kahn, Gilles},
publisher = {Springer, Berlin},
volume = {274},
series = {Lecture Notes in Computer Science},
pages = {80--97},
abstract = {The Flagship project has the aim of designing a parallel computer
system for the evaluation of declarative languages. The physical architecture
of the machine consists of a set of closely coupled processor/store pairs
communicating over a high performance communications network. A functional
program to be evaluated is compiled into a Super-Combinator expression graph
which is then simplified by graph reduction. The paper discusses the Flagship
machine architecture and describes in detail the computational model which
defines how a functional program is represented and reduced. The issues
underlying the design of the main features of the computational models are
examined.}, 
keywords = {Parallel Graph Reduction},
}

@techreport{Webe89,
author = {Weber, Ken},
title = {{dLisp --- A LAN-Distributed Lisp System}},
institution = {Department of Mathematical Sciences},
address = {Kent State University},
month = {February},
year = {1989},
abstract = {dLisp, a package of functions that can be added to existing Lisp
systems to provide for distributed evaluation of Lisp forms on a local area
network, is described and defined. A few programming examples are given and a
prototype implementation is discussed briefly.},
keywords = {Lisp, Networks},
}

@phdthesis{Youn89,
author = {Young, Jonathan Hood},
title = {{The Theory and Practice of Semantic Program Analysis for Higher-Order
Functional Programming Languages}},
school = {Department of Computer Science},
address = {Yale University},
month = {May},
year = {1989},
type = {Research Report YALEU/DCS/RR-669},
abstract = {This thesis describes the implementation of an optimizing compiler
for the language ALFL. Like many other lazy languages, a significant overhead
is incurred in order to preserve the lazy semantics of the language. The
optimizations developed in this thesis are designed to eliminate this overhead
in as many places as possible by converting lazy evaluation (call-by-need) to
eager evaluation (call-by-value). Converting call-by-need to call-by-value is
not always a safe transformation; it must be proven to be safe at each
opportunity where it arises in the source program. The safety of the
transformations in this thesis is proven correct using the technique of {\em
abstract interpretation:\/} we show that certain non-standard semantic
analyses stand in a particular relation to the standard semantics of our
language. The semantic analysis developed in this thesis include {\em
strictness analysis, termination analysis, operation count analysis,\/} and
{\em collecting interpretations.\/} We then discuss several practical issues
related to the integration of these semantic analyses into a compiler. Several
new and old techniques to compute or approximate least fixpoints of semantic
equations are presented. Finally, we document the semantics-based tools used
to build the compiler, including packages for caching the results of
functions, manipulating sets, and creating and updating environments, as well
as a highly abstract and parameterized semantic evaluator which implements the
fixpoint finding techniques mentioned above.},
keywords = {Compilation, Abstract Interpretation, Strictness Analysis,
Operation Count Analysis, Collecting Interpretations, Fixed Points},
}

@techreport{ZHH*87,
author = {Zorn, Benjamin and Hilfinger, Paul and Ho, Kinson and        
Larus, James},                      
title = {{SPUR Lisp: Design and Implementation}},
number = {UCB/CSD 87/373},
institution = {Computer Science Division (EECS)},
address = {University of California, Berkeley, CA},
year = {1987},
month = {September},
abstract = {This document describes SPUR Lisp, a Common Lisp superset designed
and implemented at U.C.\ Berkeley. Function calling sequences, system data
structures, memory management policies, etc.\ are all described in detail.
Reasons for the more important decisions are given. SPUR Lisp is implemented
on BARB, a software simulator for SPUR hardware. In addition to describing the
design of SPUR Lisp, this paper provides documentation for the BARB simulator,
the SPUR Lisp compiler, and associated tools.},
keywords = {Lisp, Futures},
}

@techreport{ZHH*88,
author = {Zorn, Benjamin and Hilfinger, Paul and Ho, Kinson and        
Larus, James and Semenzato, Luigi}, 
title = {{Features for Multiprocessing in SPUR Lisp}},
number = {UCB/CSD 88/406},
institution = {Computer Science Division (EECS)},
address = {University of California, Berkeley, CA},
year = {1988},
month = {March},
abstract = {This paper describes simple extensions to Common Lisp for
concurrent computation on multiprocessors. Functions for process creation,
communication, and synchronization are described. Multiple threads of control
are created with {\em process\/} objects. Communication and synchronization
are managed using {\em mailboxes\/}. {\em Signals\/} provide asynchronous
communication between processes. SPUR Lisp includes {\em future\/} and {\em
delay\/} values, which were first introduced in Multilisp. These features
provide a flexible and efficient basis on which higher-level multiprocessing
abstractions can be implemented and studied.},
keywords = {Lisp, Futures},
}

@article{ZHL*89,
author = {Zorn, Benjamin and Ho, Kinson and Larus, James and 
Semenzato, Luigi and Hilfinger, Paul},
title = {{Multiprocessing Extensions in Spur Lisp}},
journal = {IEEE Software},
year = {1989},
pages = {41--49},
month = {July},
abstract = {Spur Lisp is a supreset of Common Lisp that executes on the Spur
multiprocessor workstation developed at the University of California at
Berkeley. In this article, we present Spur Lisp's multiprocessing
extensions.}, 
keywords = {Lisp, Futures},
}