Paolo Missier
Publications
BibBase http://homepages.cs.ncl.ac.uk/paolo.missier/Paolo-public-with-urlpaper.bib
generated by
2019
(7)
Why-Diff: Exploiting Provenance to Understand Outcome Differences from non-identical Reproduced Workflows.
Thavasimani, P.; Cala, J.; and Missier, P.
IEEE Access,1–1. 2019.
Paper
doi
link
bibtex
abstract
5 downloads
@article{8662612,
  author   = {Thavasimani, Priyaa and Cala, Jacek and Missier, Paolo},
  title    = {{Why-Diff}: Exploiting Provenance to Understand Outcome Differences from Non-Identical Reproduced Workflows},
  journal  = {IEEE Access},
  year     = {2019},
  pages    = {1--1},
  issn     = {2169-3536},
  doi      = {10.1109/ACCESS.2019.2903727},
  url      = {https://ieeexplore.ieee.org/document/8662612/},
  keywords = {Alzheimer's disease,Big Data,Databases,Genetics,Libraries,Provenance,Reproducibility,Sentiment analysis,Software,Why-Diff,Workflow,eScience Central},
  abstract = {Data analytics processes such as scientific workflows tend to be executed repeatedly, with varying dependencies and input datasets. The case has been made in the past for tracking the provenance of the final information products through the workflow steps, to enable their reproducibility. In this work, we explore the hypothesis that provenance traces recorded during execution are also instrumental to answering questions about the observed differences between sets of results obtained from similar but not identical workflow configurations. Such differences in configurations may be introduced deliberately, i.e., to explore process variations, or accidentally, typically as the result of porting efforts or of changes in the computing environment. Using a commonly used workflow programming model as a reference, we consider both structural variations in the workflows as well as variations within their individual components. Our whydiff algorithm compares the graph representations of two provenance traces derived from two workflow variations. It produces a delta graph that can be used to produce human-readable explanations of the impact of workflow differences on observed output differences. We report on our Neo4j graph database. We also report explanations of difference between workflow results using a suite of synthetic workflows as well as real-world workflows.}
}
Data analytics processes such as scientific workflows tend to be executed repeatedly, with varying dependencies and input datasets. The case has been made in the past for tracking the provenance of the final information products through the workflow steps, to enable their reproducibility. In this work, we explore the hypothesis that provenance traces recorded during execution are also instrumental to answering questions about the observed differences between sets of results obtained from similar but not identical workflow configurations. Such differences in configurations may be introduced deliberately, i.e., to explore process variations, or accidentally, typically as the result of porting efforts or of changes in the computing environment. Using a commonly used workflow programming model as a reference, we consider both structural variations in the workflows as well as variations within their individual components. Our whydiff algorithm compares the graph representations of two provenance traces derived from two workflow variations. It produces a delta graph that can be used to produce human-readable explanations of the impact of workflow differences on observed output differences. We report on our Neo4j graph database. We also report explanations of difference between workflow results using a suite of synthetic workflows as well as real-world workflows.
Increasing phenotypic annotation improves the diagnostic rate of exome sequencing in a rare neuromuscular disorder.
Thompson, R.; Papakonstantinou Ntalis, A.; Beltran, S.; Tapf, A.; de Paula Estephan, E.; Polavarapu, K.; ’t Hoen, P. A. C.; Missier, P.; and Lochmuller, H.
Human Mutation. 2019.
Paper
doi
link
bibtex
abstract
@article{doi:10.1002/humu.23792,
  author   = {Thompson, Rachel and Papakonstantinou Ntalis, Anastasios and Beltran, Sergi and Tapf, Ana and de Paula Estephan, Eduardo and Polavarapu, Kiran and 't Hoen, Peter A. C. and Missier, Paolo and Lochm{\"u}ller, Hanns},
  title    = {Increasing phenotypic annotation improves the diagnostic rate of exome sequencing in a rare neuromuscular disorder},
  journal  = {Human Mutation},
  year     = {2019},
  keywords = {congenital myasthenic syndromes, deep phenotyping, diagnosis, exome sequencing, Exomiser, human phenotype ontology, variant prioritization},
  doi      = {10.1002/humu.23792},
  url      = {https://onlinelibrary.wiley.com/doi/abs/10.1002/humu.23792},
  eprint   = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/humu.23792},
  abstract = {Abstract Phenotype-based filtering and prioritization contribute to the interpretation of genetic variants detected in exome sequencing. However, it is currently unclear how extensive this phenotypic annotation should be. In this study, we compare methods for incorporating phenotype into the interpretation process and assess the extent to which phenotypic annotation aids prioritization of the correct variant. Using a cohort of 29 patients with congenital myasthenic syndromes with causative variants in known or newly discovered disease genes, exome data and the Human Phenotype Ontology (HPO)-coded phenotypic profiles, we show that gene-list filters created from phenotypic annotations perform similarly to curated disease-gene virtual panels. We use Exomiser, a prioritization tool incorporating phenotypic comparisons, to rank candidate variants while varying phenotypic annotation. Analyzing 3,712 combinations, we show that increasing phenotypic annotation improved prioritization of the causative variant, from 62\% ranked first on variant alone to 90\% with seven HPO annotations. We conclude that any HPO-based phenotypic annotation aids variant discovery and that annotation with over five terms is recommended in our context. Although focused on a constrained cohort, this provides real-world validation of the utility of phenotypic annotation for variant prioritization. Further research is needed to extend this concept to other diseases and more diverse cohorts.},
  urlpaper = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/PhenotypicannotationHumanMutation2019.pdf}
}
Abstract Phenotype-based filtering and prioritization contribute to the interpretation of genetic variants detected in exome sequencing. However, it is currently unclear how extensive this phenotypic annotation should be. In this study, we compare methods for incorporating phenotype into the interpretation process and assess the extent to which phenotypic annotation aids prioritization of the correct variant. Using a cohort of 29 patients with congenital myasthenic syndromes with causative variants in known or newly discovered disease genes, exome data and the Human Phenotype Ontology (HPO)-coded phenotypic profiles, we show that gene-list filters created from phenotypic annotations perform similarly to curated disease-gene virtual panels. We use Exomiser, a prioritization tool incorporating phenotypic comparisons, to rank candidate variants while varying phenotypic annotation. Analyzing 3,712 combinations, we show that increasing phenotypic annotation improved prioritization of the causative variant, from 62% ranked first on variant alone to 90% with seven HPO annotations. We conclude that any HPO-based phenotypic annotation aids variant discovery and that annotation with over five terms is recommended in our context. Although focused on a constrained cohort, this provides real-world validation of the utility of phenotypic annotation for variant prioritization. Further research is needed to extend this concept to other diseases and more diverse cohorts.
Parametrised Data Sampling for Fairness Optimisation.
González Zelaya, C. V.; Missier, P.; and Prangle, D.
In Proceedings of Explainable AI for Fairness, Accountability & Transparency Workshop (KDD XAI), 2019. ACM
Paper
link
bibtex
@inproceedings{zelaya2019correction,
  author    = {Gonz{\'a}lez Zelaya, Carlos Vladimiro and Missier, Paolo and Prangle, Dennis},
  title     = {Parametrised Data Sampling for Fairness Optimisation},
  booktitle = {Proceedings of Explainable AI for Fairness, Accountability \& Transparency Workshop (KDD XAI)},
  year      = {2019},
  organization = {ACM},
  urlpaper  = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/kddSubmission.pdf}
}
Toward a Decentralized, Trust-less Marketplace for Brokered IoT Data Trading using Blockchain.
Bajoudah, S.; Changyu, D.; and Missier, P.
In Procs. 2nd IEEE International Conference on Blockchain (Blockchain 2019), Atlanta, USA, 2019. IEEE
Paper
link
bibtex
@inproceedings{Bajoudah2019,
  author    = {Bajoudah, Shaimaa and Dong, Changyu and Missier, Paolo},
  title     = {Toward a Decentralized, Trust-less Marketplace for Brokered {IoT} Data Trading using Blockchain},
  booktitle = {Procs. 2nd IEEE International Conference on Blockchain (Blockchain 2019)},
  address   = {Atlanta, USA},
  publisher = {IEEE},
  year      = {2019},
  urlpaper  = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/Decentralised_Marketplace_USA_Conference___Accepted_Version_.pdf}
}
Efficient Re-computation of Big Data Analytics Processes in the Presence of Changes: Computational Framework, Reference Architecture, and Applications.
Missier, P.; and Cala, J.
In Procs. IEEE Big Data Congress, Milano, Italy, 2019. IEEE
Paper
link
bibtex
@inproceedings{Missier2019,
  author    = {Missier, Paolo and Cala, Jacek},
  title     = {Efficient Re-computation of {Big Data} Analytics Processes in the Presence of Changes: Computational Framework, Reference Architecture, and Applications},
  booktitle = {Procs. IEEE Big Data Congress},
  address   = {Milano, Italy},
  publisher = {IEEE},
  year      = {2019},
  keywords  = {{\#}provenance,{\#}re-computation,{\#}workflow},
  mendeley-tags = {{\#}re-computation,{\#}workflow,{\#}provenance},
  urlpaper  = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/PID5953159.pdf}
}
Targeted therapies for congenital myasthenic syndromes: systematic review and steps towards a treatabolome.
Thompson, R.; Bonne, G.; Missier, P.; and Lochmüller, H.
Emerging Topics in Life Sciences,ETLS20180100. jan 2019.
Paper
doi
link
bibtex
abstract
@article{Thompson2019,
  author   = {Thompson, Rachel and Bonne, Gis{\`{e}}le and Missier, Paolo and Lochm{\"{u}}ller, Hanns},
  title    = {Targeted Therapies for Congenital Myasthenic Syndromes: Systematic Review and Steps towards a Treatabolome},
  journal  = {Emerging Topics in Life Sciences},
  year     = {2019},
  month    = jan,
  pages    = {ETLS20180100},
  doi      = {10.1042/ETLS20180100},
  url      = {http://www.emergtoplifesci.org/content/early/2019/01/25/ETLS20180100.abstract},
  abstract = {Despite recent scientific advances, most rare genetic diseases -- including most neuromuscular diseases -- do not currently have curative gene-based therapies available. However, in some cases, such as vitamin, cofactor or enzyme deficiencies, channelopathies and disorders of the neuromuscular junction, a confirmed genetic diagnosis provides guidance on treatment, with drugs available that may significantly alter the disease course, improve functional ability and extend life expectancy. Nevertheless, many treatable patients remain undiagnosed or do not receive treatment even after genetic diagnosis. The growth of computer-aided genetic analysis systems that enable clinicians to diagnose their undiagnosed patients has not yet been matched by genetics-based decision-support systems for treatment guidance. Generating a `treatabolome' of treatable variants and the evidence for the treatment has the potential to increase treatment rates for treatable conditions. Here, we use the congenital myasthenic syndromes (CMS), a group of clinically and genetically heterogeneous but frequently treatable neuromuscular conditions, to illustrate the steps in the creation of a treatabolome for rare inherited diseases. We perform a systematic review of the evidence for pharmacological treatment of each CMS type, gathering evidence from 207 studies of over 1000 patients and stratifying by genetic defect, as treatment varies depending on the underlying cause. We assess the strength and quality of the evidence and create a dataset that provides the foundation for a computer-aided system to enable clinicians to gain easier access to information about treatable variants and the evidence they need to consider. 3,4-DAP, 3,4-diaminopyridine; AChE, acetylcholinesterase; AChR, acetylcholine receptor; CEBM, Centre for evidence-based medicine; CMS, congenital myasthenic syndrome; NGS, next-generation sequencing; NMJ, neuromuscular junction}
}
Despite recent scientific advances, most rare genetic diseases — including most neuromuscular diseases — do not currently have curative gene-based therapies available. However, in some cases, such as vitamin, cofactor or enzyme deficiencies, channelopathies and disorders of the neuromuscular junction, a confirmed genetic diagnosis provides guidance on treatment, with drugs available that may significantly alter the disease course, improve functional ability and extend life expectancy. Nevertheless, many treatable patients remain undiagnosed or do not receive treatment even after genetic diagnosis. The growth of computer-aided genetic analysis systems that enable clinicians to diagnose their undiagnosed patients has not yet been matched by genetics-based decision-support systems for treatment guidance. Generating a ‘treatabolome’ of treatable variants and the evidence for the treatment has the potential to increase treatment rates for treatable conditions. Here, we use the congenital myasthenic syndromes (CMS), a group of clinically and genetically heterogeneous but frequently treatable neuromuscular conditions, to illustrate the steps in the creation of a treatabolome for rare inherited diseases. We perform a systematic review of the evidence for pharmacological treatment of each CMS type, gathering evidence from 207 studies of over 1000 patients and stratifying by genetic defect, as treatment varies depending on the underlying cause. We assess the strength and quality of the evidence and create a dataset that provides the foundation for a computer-aided system to enable clinicians to gain easier access to information about treatable variants and the evidence they need to consider. 3,4-DAP, 3,4-diaminopyridine; AChE, acetylcholinesterase; AChR, acetylcholine receptor; CEBM, Centre for evidence-based medicine; CMS, congenital myasthenic syndrome; NGS, next-generation sequencing; NMJ, neuromuscular junction
2018
(8)
Selective and Recurring Re-computation of Big Data Analytics Tasks: Insights from a Genomics Case Study.
Cała, J.; and Missier, P.
Big Data Research, 13: 76 - 94. 2018.
Big Medical/Healthcare Data Analytics
Paper
doi
link
bibtex
abstract
1 download
@article{CALA201876,
  author   = {Ca{\l}a, Jacek and Missier, Paolo},
  title    = {Selective and Recurring Re-computation of {Big Data} Analytics Tasks: Insights from a Genomics Case Study},
  journal  = {Big Data Research},
  volume   = {13},
  pages    = {76--94},
  year     = {2018},
  note     = {Big Medical/Healthcare Data Analytics},
  issn     = {2214-5796},
  doi      = {10.1016/j.bdr.2018.06.001},
  url      = {http://www.sciencedirect.com/science/article/pii/S2214579617303520},
  keywords = {Re-computation, Knowledge decay, Big data analysis, Genomics},
  abstract = {The value of knowledge assets generated by analytics processes using Data Science techniques tends to decay over time, as a consequence of changes in the elements the process depends on: external data sources, libraries, and system dependencies. For large-scale problems, refreshing those outcomes through greedy re-computation is both expensive and inefficient, as some changes have limited impact. In this paper we address the problem of refreshing past process outcomes selectively, that is, by trying to identify the subset of outcomes that will have been affected by a change, and by only re-executing fragments of the original process. We propose a technical approach to address the selective re-computation problem by combining multiple techniques, and present an extensive experimental study in Genomics, namely variant calling and their clinical interpretation, to show its effectiveness. In this case study, we are able to decrease the number of required re-computations on a cohort of individuals from 495 (blind) down to 71, and that we can reduce runtime by at least 60\% relative to the na{\"\i}ve blind approach, and in some cases by 90\%. Starting from this experience, we then propose a blueprint for a generic re-computation meta-process that makes use of process history metadata to make informed decisions about selective re-computations in reaction to a variety of changes in the data.}
}
The value of knowledge assets generated by analytics processes using Data Science techniques tends to decay over time, as a consequence of changes in the elements the process depends on: external data sources, libraries, and system dependencies. For large-scale problems, refreshing those outcomes through greedy re-computation is both expensive and inefficient, as some changes have limited impact. In this paper we address the problem of refreshing past process outcomes selectively, that is, by trying to identify the subset of outcomes that will have been affected by a change, and by only re-executing fragments of the original process. We propose a technical approach to address the selective re-computation problem by combining multiple techniques, and present an extensive experimental study in Genomics, namely variant calling and their clinical interpretation, to show its effectiveness. In this case study, we are able to decrease the number of required re-computations on a cohort of individuals from 495 (blind) down to 71, and that we can reduce runtime by at least 60% relative to the naïve blind approach, and in some cases by 90%. Starting from this experience, we then propose a blueprint for a generic re-computation meta-process that makes use of process history metadata to make informed decisions about selective re-computations in reaction to a variety of changes in the data.
Versioned-PROV: A PROV extension to support mutable data entities.
Pimentel, J. F.; Missier, P.; Murta, L.; and Braganholo, V.
In Procs. IPAW 2018, London, 2018. Springer
link
bibtex
@inproceedings{PMMB18,
  author    = {Pimentel, Jo{\~a}o Felipe and Missier, Paolo and Murta, Leonardo and Braganholo, Vanessa},
  title     = {{Versioned-PROV}: A {PROV} Extension to Support Mutable Data Entities},
  booktitle = {Procs. IPAW 2018},
  address   = {London},
  publisher = {Springer},
  year      = {2018},
  keywords  = {{\#}provenance,{\#}recomputation,process re-computation,provenance annotations},
  mendeley-tags = {{\#}recomputation,{\#}provenance}
}
Editorial: Special Issue on Improving the Veracity and Value of Big Data.
Geerts, F.; Missier, P.; and Paton, N. W.
J. Data and Information Quality, 9(3): 13:1–13:2. 2018.
Paper
doi
link
bibtex
@article{DBLP:journals/jdiq/GeertsMP18,
  author    = {Geerts, Floris and Missier, Paolo and Paton, Norman W.},
  title     = {Editorial: Special Issue on Improving the Veracity and Value of Big Data},
  journal   = {J. Data and Information Quality},
  volume    = {9},
  number    = {3},
  pages     = {13:1--13:2},
  year      = {2018},
  url       = {http://doi.acm.org/10.1145/3174791},
  doi       = {10.1145/3174791},
  timestamp = {Wed, 28 Mar 2018 12:00:19 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/jdiq/GeertsMP18},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
Provenance Annotation and Analysis to Support Process Re-Computation.
Cala, J.; and Missier, P.
In Procs. IPAW 2018, London, 2018. Springer
Paper
link
bibtex
abstract
@inproceedings{Cala2018,
  author    = {Cala, Jacek and Missier, Paolo},
  title     = {Provenance Annotation and Analysis to Support Process Re-Computation},
  booktitle = {Procs. IPAW 2018},
  address   = {London},
  publisher = {Springer},
  year      = {2018},
  keywords  = {{\#}provenance,{\#}recomputation,process re-computation,provenance annotations},
  mendeley-tags = {{\#}recomputation,{\#}provenance},
  abstract  = {Many resource-intensive analytics processes evolve over time following new versions of the reference datasets and software dependencies they use. We focus on scenarios in which any version change has the potential to affect many outcomes, as is the case for instance in high throughput genomics where the same process is used to analyse large cohorts of patient genomes, or cases. As any version change is unlikely to affect the entire population, an efficient strategy for restoring the currency of the outcomes requires first to identify the scope of a change, i.e., the subset of affected data products. In this paper we describe a generic and reusable provenance-based approach to address this scope discovery problem. It applies to a scenario where the process consists of complex hierarchical components, where different input cases are processed using different version configurations of each component, and where separate provenance traces are collected for the executions of each of the components. We show how a new data structure, called a restart tree, is computed and exploited to manage the change scope discovery problem.},
  urlpaper  = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/recomp-core-prov.pdf}
}
Many resource-intensive analytics processes evolve over time following new versions of the reference datasets and software dependencies they use. We focus on scenarios in which any version change has the potential to affect many outcomes, as is the case for instance in high throughput genomics where the same process is used to analyse large cohorts of patient genomes, or cases. As any version change is unlikely to affect the entire population, an efficient strategy for restoring the currency of the outcomes requires first to identify the scope of a change, i.e., the subset of affected data products. In this paper we describe a generic and reusable provenance-based approach to address this scope discovery problem. It applies to a scenario where the process consists of complex hierarchical components, where different input cases are processed using different version configurations of each component, and where separate provenance traces are collected for the executions of each of the components. We show how a new data structure, called a restart tree, is computed and exploited to manage the change scope discovery problem.
Analyzing Social Network Images with Deep Learning Models to Fight Zika Virus.
Barros, H. P.; Lima, B. G. C.; Crispim, F. C.; Vieira, T.; Missier, P.; and Fonseca, B.
In Procs. 15th International Conference on Image Analysis and Recognition (ICIAR'18), 2018.
Paper
link
bibtex
@inproceedings{Barros2018,
  author    = {Barros, H. Pedro and Lima, Bruno G. C. and Crispim, Felipe C. and Vieira, Tiago and Missier, Paolo and Fonseca, Baldoino},
  title     = {Analyzing Social Network Images with Deep Learning Models to Fight {Zika} Virus},
  booktitle = {Procs. 15th International Conference on Image Analysis and Recognition (ICIAR'18)},
  year      = {2018},
  keywords  = {{\#}zika},
  mendeley-tags = {{\#}zika},
  urlpaper  = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/analyzing-social-network.pdf}
}
Loom: Query-aware Partitioning of Online Graphs.
Firth, H; and Missier, P
In Procs. 21st International Conference on Extending Database Technology (EDBT), Vienna, Austria, 2018. EDBT
Paper
link
bibtex
abstract
@inproceedings{Firth2018,
  author    = {Firth, Hugo and Missier, Paolo},
  title     = {{Loom}: Query-aware Partitioning of Online Graphs},
  booktitle = {Procs. 21st International Conference on Extending Database Technology (EDBT)},
  address   = {Vienna, Austria},
  publisher = {EDBT},
  year      = {2018},
  keywords  = {distributed graphs,graph partitioning},
  url       = {http://edbticdt2018.at/},
  urlpaper  = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/1711.06608.pdf},
  abstract  = {As with general graph processing systems, partitioning data over a cluster of machines improves the scalability of graph database management systems. However, these systems will incur additional network cost during the execution of a query workload, due to inter-partition traversals. Workload-agnostic partitioning algorithms typically minimise the likelihood of any edge crossing partition boundaries. However, these partitioners are sub-optimal with respect to many workloads, especially queries, which may require more frequent traversal of specific subsets of inter-partition edges. Furthermore, they are largely unsuited to operating incrementally on dynamic, growing graphs. We present a new graph partitioning algorithm, Loom, that operates on a stream of graph updates and continuously allocates the new vertices and edges to partitions, taking into account a query workload of graph pattern expressions along with their relative frequencies. First we capture the most common patterns of edge traversals which occur when executing queries. We then compare sub-graphs, which present themselves incrementally in the graph update stream, against these common patterns. Finally we attempt to allocate each match to single partitions, reducing the number of inter-partition edges within frequently traversed sub-graphs and improving average query performance. Loom is extensively evaluated over several large test graphs with realistic query workloads and various orderings of the graph updates. We demonstrate that, given a workload, our prototype produces partitionings of significantly better quality than existing streaming graph partitioning algorithms Fennel {\&} LDG.}
}
As with general graph processing systems, partitioning data over a cluster of machines improves the scalability of graph database management systems. However, these systems will incur additional network cost during the execution of a query workload, due to inter-partition traversals. Workload-agnostic partitioning algorithms typically minimise the likelihood of any edge crossing partition boundaries. However, these partitioners are sub-optimal with respect to many workloads, especially queries, which may require more frequent traversal of specific subsets of inter-partition edges. Furthermore, they are largely unsuited to operating incrementally on dynamic, growing graphs. We present a new graph partitioning algorithm, Loom, that operates on a stream of graph updates and continuously allocates the new vertices and edges to partitions, taking into account a query workload of graph pattern expressions along with their relative frequencies. First we capture the most common patterns of edge traversals which occur when executing queries. We then compare sub-graphs, which present themselves incrementally in the graph update stream, against these common patterns. Finally we attempt to allocate each match to single partitions, reducing the number of inter-partition edges within frequently traversed sub-graphs and improving average query performance. Loom is extensively evaluated over several large test graphs with realistic query workloads and various orderings of the graph updates. We demonstrate that, given a workload, our prototype produces partitionings of significantly better quality than existing streaming graph partitioning algorithms Fennel & LDG.
2017
(8)
Adaptive Incremental Learning for Statistical Relational Models Using Gradient-Based Boosting.
Gu, Y.; and Missier, P.
In Procs. ILP '17, 27th International Conference on Inductive Logic Programming (late-breaking paper), Orleans, France, 2017. CEUR-WS
Paper
link
bibtex
abstract
@inproceedings{Gu2017,
  author    = {Gu, Yulong and Missier, Paolo},
  title     = {Adaptive Incremental Learning for Statistical Relational Models Using Gradient-Based Boosting},
  booktitle = {Procs. ILP '17, 27th International Conference on Inductive Logic Programming (late-breaking paper)},
  address   = {Orleans, France},
  publisher = {CEUR-WS},
  year      = {2017},
  url       = {https://ilp2017.sciencesconf.org/data/pages/ILP{\_}2017{\_}paper{\_}27.pdf},
  urlpaper  = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/ILP_2017_paper_27.pdf},
  abstract  = {We consider the problem of incrementally learning models from relational data. Most existing learning methods for statistical relational models use batch learning, which becomes computationally expensive and eventually infeasible for large datasets. The majority of the previous work in relational incremental learning assumes the model's structure is given and only the model's parameters needed to be learned. In this paper, we propose algorithms that can incrementally learn the model's parameters and structure simultaneously. These algorithms are based on the successful formalisation of the relational functional gradient boosting system (RFGB), and extend the classical propositional ensemble methods to relational learning for handling evolving data streams.}
}
We consider the problem of incrementally learning models from relational data. Most existing learning methods for statistical relational models use batch learning, which becomes computationally expensive and eventually infeasible for large datasets. The majority of the previous work in relational incremental learning assumes the model's structure is given and only the model's parameters needed to be learned. In this paper, we propose algorithms that can incrementally learn the model's parameters and structure simultaneously. These algorithms are based on the successful formalisation of the relational functional gradient boosting system (RFGB), and extend the classical propositional ensemble methods to relational learning for handling evolving data streams.
Mind My Value: a Decentralized Infrastructure for Fair and Trusted IoT Data Trading.
Missier, P.; Bajoudah, S.; Capossele, A.; Gaglione, A.; and Nati, M.
In Procs. 7th International Conference on the Internet of Things, Linz,Austria, 2017.
Paper
link
bibtex
abstract
@inproceedings{Missier2017d,
  author    = {Missier, Paolo and Bajoudah, Shaimaa and Capossele, Angelo and Gaglione, Andrea and Nati, Michele},
  title     = {Mind My Value: a Decentralized Infrastructure for Fair and Trusted {IoT} Data Trading},
  booktitle = {Procs. 7th International Conference on the Internet of Things},
  address   = {Linz, Austria},
  year      = {2017},
  keywords  = {{\#}IoT,{\#}marketplace},
  mendeley-tags = {{\#}IoT,{\#}marketplace},
  url       = {http://iot-conference.org/iot2017/},
  urlpaper  = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/iot-conf.pdf},
  abstract  = {Internet of Things (IoT) data are increasingly viewed as a new form of massively distributed and large scale digital assets, which are continuously generated by millions of connected devices. The real value of such assets can only be realized by allowing IoT data trading to occur on a marketplace that rewards every single producer and consumer, at a very granular level. Crucially, we believe that such a marketplace should not be owned by anybody, and should instead fairly and transparently self-enforce a well defined set of governance rules. In this paper we address some of the technical challenges involved in realizing such a marketplace. We leverage emerging blockchain technologies to build a decentralized, trusted, transparent and open architecture for IoT traffic metering and contract compliance, on top of the largely adopted IoT brokered data infrastructure. We discuss an Ethereum-based prototype implementation and experimentally evaluate the overhead cost associated with Smart Contract transactions, concluding that a viable business model can indeed be associated with our technical approach.}
}
Internet of Things (IoT) data are increasingly viewed as a new form of massively distributed and large scale digital assets, which are continuously generated by millions of connected devices. The real value of such assets can only be realized by allowing IoT data trading to occur on a marketplace that rewards every single producer and consumer, at a very granular level. Crucially, we believe that such a marketplace should not be owned by anybody, and should instead fairly and transparently self-enforce a well defined set of governance rules. In this paper we address some of the technical challenges involved in realizing such a marketplace. We leverage emerging blockchain technologies to build a decentralized, trusted, transparent and open architecture for IoT traffic metering and contract compliance, on top of the largely adopted IoT brokered data infrastructure. We discuss an Ethereum-based prototype implementation and experimentally evaluate the overhead cost associated with Smart Contract transactions, concluding that a viable business model can indeed be associated with our technical approach.
Preserving the value of large scale data analytics over time through selective re-computation.
Missier, P.; Cala, J.; and Rathi, M.
In Procs. 31st British International Conference on Databases - BICOD, 2017.
Paper
link
bibtex
@inproceedings{Missier2017c,
  author    = {Missier, Paolo and Cala, Jacek and Rathi, Manisha},
  booktitle = {Procs. 31st British International Conference on Databases - BICOD},
  title     = {{Preserving the value of large scale data analytics over time through selective re-computation}},
  year      = {2017},
  urlpaper  = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/RecompVision.pdf}
}
.
Missier, P.
Provenance Standards, pages 1–8. Liu, L.; and Özsu, M T., editor(s). Springer New York, New York, NY, 2017.
Paper
doi
link
bibtex
@inbook{Missier2017,
address = {New York, NY},
author = {Missier, Paolo},
booktitle = {Encyclopedia of Database Systems},
doi = {10.1007/978-1-4899-7993-3_80749-1},
editor = {Liu, Ling and {\"{O}}zsu, M Tamer},
isbn = {978-1-4899-7993-3},
pages = {1--8},
publisher = {Springer New York},
title = {{Provenance Standards}},
url = {https://doi.org/10.1007/978-1-4899-7993-3{\_}80749-1},
year = {2017},
urlpaper = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/Provenance-standards.pdf}
}
TAPER: query-aware, partition-enhancement for large, heterogenous graphs.
Firth, H.; and Missier, P.
Distributed and Parallel Databases,1–31. 2017.
Paper
doi
link
bibtex
abstract
@article{Firth2017,
abstract = {Graph partitioning has long been seen as a viable approach to addressing Graph DBMS scalability. A partitioning, however, may introduce extra query processing latency unless it is sensitive to a specific query workload, and optimised to minimise inter-partition traversals for that workload. Additionally, it should also be possible to incrementally adjust the partitioning in reaction to changes in the graph topology, the query workload, or both. Because of their complexity, current partitioning algorithms fall short of one or both of these requirements, as they are designed for offline use and as one-off operations. The TAPER system aims to address both requirements, whilst leveraging existing partitioning algorithms. TAPER takes any given initial partitioning as a starting point, and iteratively adjusts it by swapping chosen vertices across partitions, heuristically reducing the probability of inter-partition traversals for a given path queries workload. Iterations are inexpensive thanks to time and space optimisations in the underlying support data structures. We evaluate TAPER on two different large test graphs and over realistic query workloads. Our results indicate that, given a hash-based partitioning, TAPER reduces the number of inter-partition traversals by $\sim$80{\%}; given an unweighted Metis partitioning, by $\sim$30{\%}. These reductions are achieved within eight iterations and with the additional advantage of being workload-aware and usable online.},
author = {Firth, Hugo and Missier, Paolo},
doi = {10.1007/s10619-017-7196-y},
issn = {1573-7578},
journal = {Distributed and Parallel Databases},
pages = {1--31},
title = {{TAPER}: query-aware, partition-enhancement for large, heterogenous graphs},
url = {http://dx.doi.org/10.1007/s10619-017-7196-y},
year = {2017}
}
Graph partitioning has long been seen as a viable approach to addressing Graph DBMS scalability. A partitioning, however, may introduce extra query processing latency unless it is sensitive to a specific query workload, and optimised to minimise inter-partition traversals for that workload. Additionally, it should also be possible to incrementally adjust the partitioning in reaction to changes in the graph topology, the query workload, or both. Because of their complexity, current partitioning algorithms fall short of one or both of these requirements, as they are designed for offline use and as one-off operations. The TAPER system aims to address both requirements, whilst leveraging existing partitioning algorithms. TAPER takes any given initial partitioning as a starting point, and iteratively adjusts it by swapping chosen vertices across partitions, heuristically reducing the probability of inter-partition traversals for a given path queries workload. Iterations are inexpensive thanks to time and space optimisations in the underlying support data structures. We evaluate TAPER on two different large test graphs and over realistic query workloads. Our results indicate that, given a hash-based partitioning, TAPER reduces the number of inter-partition traversals by ∼80%; given an unweighted Metis partitioning, by ∼30%. These reductions are achieved within eight iterations and with the additional advantage of being workload-aware and usable online.
Revealing the Detailed Lineage of Script Outputs using Hybrid Provenance.
Zhang, Q.; Cao, Y.; Wang, Q.; Vu, D.; Thavasimani, P.; McPhillips, T.; Missier, P.; Slaughter, P.; Jones, C.; Jones, M. B; and Ludascher, B.
In Procs. 11th Intl. Digital Curation Conference (IDCC), Edinburgh, Scotland, UK, 2017. Digital Curation Center
Paper
link
bibtex
abstract
@inproceedings{Zhang2017,
  abstract      = {We illustrate how combining retrospective and prospective provenance can yield scientifically meaningful hybrid provenance representations of the computational histories of data produced during a script run. We use scripts from multiple disciplines (astrophysics, climate science, biodiversity data curation, and social network analysis), implemented in Python, R, and MATLAB, to highlight the usefulness of diverse forms of retrospective provenance when coupled with prospective provenance. Users provide prospective provenance (i.e., the conceptual workflows latent in scripts) via simple YesWorkflow annotations, embedded as script comments. Runtime observables, hidden in filenames or folder structures, recorded in log-files, or automatically captured using tools such as noWorkflow or the DataONE RunManagers can be linked to prospective provenance via relational views and queries. The YesWorkflow toolkit, example scripts, and demonstration code are available via an open source repository.},
  address       = {Edinburgh, Scotland, UK},
  author        = {Zhang, Qian and Cao, Yang and Wang, Qiwen and Vu, Duc and Thavasimani, Priyaa and McPhillips, Tim and Missier, Paolo and Slaughter, Peter and Jones, Christopher and Jones, Matthew B and Ludascher, Bertram},
  booktitle     = {Procs. 11th Intl. Digital Curation Conference (IDCC)},
  file          = {:Users/paolo/Documents/myGRID/refs/Zhang-Cao{\_}etal.pdf:pdf},
  keywords      = {{\#}provenance},
  mendeley-tags = {{\#}provenance},
  publisher     = {Digital Curation Center},
  title         = {{Revealing the Detailed Lineage of Script Outputs using Hybrid Provenance}},
  urlpaper      = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/Zhang-Cao_etal.pdf},
  year          = {2017}
}
We illustrate how combining retrospective and prospective provenance can yield scientifically meaningful hybrid provenance representations of the computational histories of data produced during a script run. We use scripts from multiple disciplines (astrophysics, climate science, biodiversity data curation, and social network analysis), implemented in Python, R, and MATLAB, to highlight the usefulness of diverse forms of retrospective provenance when coupled with prospective provenance. Users provide prospective provenance (i.e., the conceptual workflows latent in scripts) via simple YesWorkflow annotations, embedded as script comments. Runtime observables, hidden in filenames or folder structures, recorded in log-files, or automatically captured using tools such as noWorkflow or the DataONE RunManagers can be linked to prospective provenance via relational views and queries. The YesWorkflow toolkit, example scripts, and demonstration code are available via an open source repository.
2016
(10)
Clustering Provenance Facilitating Provenance Exploration Through Data Abstraction.
Karsai, L.; Fekete, A.; Kay, J.; and Missier, P.
In Proceedings of the Workshop on Human-In-the-Loop Data Analytics, of HILDA '16, pages 6:1—-6:5, New York, NY, USA, 2016. ACM
Paper
doi
link
bibtex
@inproceedings{Karsai:2016:CPF:2939502.2939508,
address = {New York, NY, USA},
author = {Karsai, Linus and Fekete, Alan and Kay, Judy and Missier, Paolo},
booktitle = {Proceedings of the Workshop on Human-In-the-Loop Data Analytics},
doi = {10.1145/2939502.2939508},
isbn = {978-1-4503-4207-0},
keywords = {provenance, visualisation, large-scale graphs},
pages = {6:1--6:5},
publisher = {ACM},
series = {HILDA '16},
title = {{Clustering Provenance Facilitating Provenance Exploration Through Data Abstraction}},
url = {http://doi.acm.org/10.1145/2939502.2939508},
year = {2016}
}
Alan Turing Institute Symposium on Reproducibility for Data-Intensive Research – Final Report.
Burgess, L. C; Crotty, D.; de Roure, D.; Gibbons, J.; Goble, C.; Missier, P.; Mortier, R.; Nichols, T. E; and O'Beirne, R.
. 2016.
Paper
link
bibtex
@article{burgess2016alan,
title = {{Alan Turing Institute Symposium on Reproducibility for Data-Intensive Research -- Final Report}},
author = {Burgess, Lucie C and Crotty, David and de Roure, David and Gibbons, Jeremy and Goble, Carole and Missier, Paolo and Mortier, Richard and Nichols, Thomas E and O'Beirne, Richard},
url = {https://dx.doi.org/10.6084/m9.figshare.3487382},
year = {2016}
}
The data, they are a-changin'.
Missier, P.; Cala, J.; and Wijaya, E.
In Cohen-Boulakia, S., editor(s), Proc. TAPP'16 (Theory and Practice of Provenance), Washington D.C., USA, 2016. USENIX Association
Paper
link
bibtex
abstract
@inproceedings{Paolo2016,
abstract = {The cost of deriving actionable knowledge from large datasets has been decreasing thanks to a convergence of positive factors: low cost data generation, inexpensively scalable storage and processing infrastructure (cloud), software frameworks and tools for massively distributed data processing, and parallelisable data analytics algorithms. One observation that is often overlooked, however, is that each of these elements is not immutable, rather they all evolve over time. This suggests that the value of such derivative knowledge may decay over time, unless it is preserved by reacting to those changes. Our broad research goal is to develop models, methods, and tools for selectively reacting to changes by balancing costs and benefits, i.e. through complete or partial re-computation of some of the underlying processes. In this paper we present an initial model for reasoning about change and re-computations, and show how analysis of detailed provenance of derived knowledge informs re-computation decisions. We illustrate the main ideas through a real-world case study in genomics, namely on the interpretation of human variants in support of genetic diagnosis.},
address = {Washington D.C., USA},
author = {Missier, Paolo and Cala, Jacek and Wijaya, Eldarina},
booktitle = {Proc. TAPP'16 (Theory and Practice of Provenance)},
editor = {Cohen-Boulakia, Sarah},
keywords = {{\#}big data processing,{\#}data change,{\#}provenance,{\#}re-computation},
mendeley-tags = {{\#}big data processing,{\#}data change,{\#}provenance,{\#}re-computation},
publisher = {USENIX Association},
title = {{The data, they are a-changin'}},
url = {https://arxiv.org/abs/1604.06412},
year = {2016}
}
The cost of deriving actionable knowledge from large datasets has been decreasing thanks to a convergence of positive factors: low cost data generation, inexpensively scalable storage and processing infrastructure (cloud), software frameworks and tools for massively distributed data processing, and parallelisable data analytics algorithms. One observation that is often overlooked, however, is that each of these elements is not immutable, rather they all evolve over time. This suggests that the value of such derivative knowledge may decay over time, unless it is preserved by reacting to those changes. Our broad research goal is to develop models, methods, and tools for selectively reacting to changes by balancing costs and benefits, i.e. through complete or partial re-computation of some of the underlying processes. In this paper we present an initial model for reasoning about change and re-computations, and show how analysis of detailed provenance of derived knowledge informs re-computation decisions. We illustrate the main ideas through a real-world case study in genomics, namely on the interpretation of human variants in support of genetic diagnosis.
Analyzing Provenance across Heterogeneous Provenance Graphs.
Oliveira, W.; Missier, P.; Ocana, K.; de Oliveira, D.; and Braganholo, V.
In Procs. IPAW 2016, Washington D.C., USA, 2016. Springer
link
bibtex
abstract
@inproceedings{Oliveira2016,
abstract = {Provenance generated by different workflow systems is generally expressed using different formats. This is not an issue when scientists analyze provenance graphs in isolation, or when they use the same workflow system. However, analyzing heterogeneous provenance graphs from multiple systems poses a challenge. To address this problem we adopt ProvONE as an integration model, and show how different provenance databases can be converted to a global ProvONE schema. Scientists can then query this integrated database, exploring and linking provenance across several different workflows that may represent different implementations of the same experiment. To illustrate the feasibility of our approach, we developed conceptual mappings between the provenance databases of two workflow systems (e-Science Central and SciCumulus). We provide cartridges that implement these mappings and generate an integrated provenance database expressed as Prolog facts. To demonstrate its usage, we have developed Prolog rules that enable scientists to query the integrated database.},
address = {Washington D.C., USA},
author = {Oliveira, Wellington and Missier, Paolo and Ocana, Kary and de Oliveira, Daniel and Braganholo, Vanessa},
booktitle = {Procs. IPAW 2016},
keywords = {{\#}provenance},
mendeley-tags = {{\#}provenance},
publisher = {Springer},
title = {{Analyzing Provenance across Heterogeneous Provenance Graphs}},
year = {2016}
}
Provenance generated by different workflow systems is generally expressed using different formats. This is not an issue when scientists analyze provenance graphs in isolation, or when they use the same workflow system. However, analyzing heterogeneous provenance graphs from multiple systems poses a challenge. To address this problem we adopt ProvONE as an integration model, and show how different provenance databases can be converted to a global ProvONE schema. Scientists can then query this integrated database, exploring and linking provenance across several different workflows that may represent different implementations of the same experiment. To illustrate the feasibility of our approach, we developed conceptual mappings between the provenance databases of two workflow systems (e-Science Central and SciCumulus). We provide cartridges that implement these mappings and generate an integrated provenance database expressed as Prolog facts. To demonstrate its usage, we have developed Prolog rules that enable scientists to query the integrated database.
Tracking Dengue Epidemics using Twitter Content Classification and Topic Modelling.
Missier, P.; Romanovsky, A; Miu, T; Pal, A; Daniilakis, M; Garcia, A; Cedrim, D; and Sousa, L
In Procs. SoWeMine workshop, co-located with ICWE 2016, Lugano, Switzerland, 2016.
Paper
link
bibtex
abstract
@inproceedings{Missier2016a,
abstract = {Detecting and preventing outbreaks of mosquito-borne diseases such as Dengue and Zika in Brasil and other tropical regions has long been a priority for governments in affected areas. Streaming social media content, such as Twitter, is increasingly being used for health vigilance applications such as flu detection. However, previous work has not addressed the complexity of drastic seasonal changes on Twitter across multiple epidemic outbreaks. In order to address this gap, this paper contrasts two complementary approaches to detecting Twitter content that is relevant for Dengue outbreak detection, namely supervised classification and unsupervised clustering using topic modelling. Each approach has benefits and shortcomings. Our classifier achieves a prediction accuracy of about 80{\%} based on a small training set of about 1,000 instances, but the need for manual annotation makes it hard to track seasonal changes in the nature of the epidemics, such as the emergence of new types of virus in certain geographical locations. In contrast, LDA-based topic modelling scales well, generating cohesive and well-separated clusters from larger samples. While clusters can be easily re-generated following changes in epidemics, however, this approach makes it hard to clearly segregate relevant tweets into well-defined clusters.},
address = {Lugano, Switzerland},
author = {Missier, Paolo and Romanovsky, A and Miu, T and Pal, A and Daniilakis, M and Garcia, A and Cedrim, D and Sousa, L},
booktitle = {Procs. SoWeMine workshop, co-located with ICWE 2016},
keywords = {{\#}social media analytics,{\#}twitter analytics},
mendeley-tags = {{\#}social media analytics,{\#}twitter analytics},
title = {{Tracking Dengue Epidemics using Twitter Content Classification and Topic Modelling}},
year = {2016},
url = {http://arxiv.org/abs/1605.00968}
}
Detecting and preventing outbreaks of mosquito-borne diseases such as Dengue and Zika in Brasil and other tropical regions has long been a priority for governments in affected areas. Streaming social media content, such as Twitter, is increasingly being used for health vigilance applications such as flu detection. However, previous work has not addressed the complexity of drastic seasonal changes on Twitter across multiple epidemic outbreaks. In order to address this gap, this paper contrasts two complementary approaches to detecting Twitter content that is relevant for Dengue outbreak detection, namely supervised classification and unsupervised clustering using topic modelling. Each approach has benefits and shortcomings. Our classifier achieves a prediction accuracy of about 80% based on a small training set of about 1,000 instances, but the need for manual annotation makes it hard to track seasonal changes in the nature of the epidemics, such as the emergence of new types of virus in certain geographical locations. In contrast, LDA-based topic modelling scales well, generating cohesive and well-separated clusters from larger samples. While clusters can be easily re-generated following changes in epidemics, however, this approach makes it hard to clearly segregate relevant tweets into well-defined clusters.
Workload-aware streaming graph partitioning.
Firth, H.; and Missier, P.
In Procs. GraphQ Workshop, co-located with EDBT'16, Bordeaux, 2016.
link
bibtex
@inproceedings{Firth2016,
  address   = {Bordeaux},
  author    = {Firth, Hugo and Missier, Paolo},
  booktitle = {Procs. GraphQ Workshop, co-located with EDBT'16},
  title     = {{Workload-aware streaming graph partitioning}},
  year      = {2016}
}
Data trajectories: tracking reuse of published data for transitive credit attribution.
Missier, P.
International Journal of Digital Curation, 11(1): 1–16. 2016.
Paper
paper
slides
doi
link
bibtex
abstract
@article{Missier2016,
abstract = {The ability to measure the use and impact of published data sets is key to the success of the open data / open science paradigm. A direct measure of impact would require tracking data (re)use in the wild, which however is difficult to achieve. This is therefore commonly replaced by simpler metrics based on data download and citation counts. In this paper we describe a scenario where it is possible to track the trajectory of a dataset after its publication, and we show how this enables the design of accurate models for ascribing credit to data originators. A Data Trajectory (DT) is a graph that encodes knowledge of how, by whom, and in which context data has been re-used, possibly after several generations. We provide a theoretical model of DTs that is grounded in the W3C PROV data model for provenance, and we show how DTs can be used to automatically propagate a fraction of the credit associated with transitively derived datasets, back to original data contributors. We also show this model of transitive credit in action by means of a Data Reuse Simulator. Ultimately, our hope is that, in the longer term, credit models based on direct measures of data reuse will provide further incentives to data publication. We conclude by outlining a research agenda to address the hard questions of creating, collecting, and using DTs systematically across a large number of data reuse instances, in the wild.},
address = {Amsterdam},
author = {Missier, Paolo},
doi = {10.2218/ijdc.v11i1.425},
file = {:Users/paolo/Documents/myGRID/refs/DT.pdf:pdf},
journal = {International Journal of Digital Curation},
keywords = {data reuse,data trajectories,provenance},
mendeley-tags = {data reuse,data trajectories,provenance},
number = {1},
pages = {1--16},
publisher = {DCC},
title = {{Data trajectories: tracking reuse of published data for transitive credit attribution}},
url = {http://bibbase.org/network/publication/missier-datatrajectoriestrackingreuseofpublisheddatafortransitivecreditattribution-2016},
volume = {11},
year = {2016},
url_Paper = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/425-1828-1-PB.pdf},
url_Slides = {http://www.slideshare.net/pmissier/data-trajectories-tracking-the-reuse-of-published-datafor-transitive-credit-attribution}
}
The ability to measure the use and impact of published data sets is key to the success of the open data / open science paradigm. A direct measure of impact would require tracking data (re)use in the wild, which however is difficult to achieve. This is therefore commonly replaced by simpler metrics based on data download and citation counts. In this paper we describe a scenario where it is possible to track the trajectory of a dataset after its publication, and we show how this enables the design of accurate models for ascribing credit to data originators. A Data Trajectory (DT) is a graph that encodes knowledge of how, by whom, and in which context data has been re-used, possibly after several generations. We provide a theoretical model of DTs that is grounded in the W3C PROV data model for provenance, and we show how DTs can be used to automatically propagate a fraction of the credit associated with transitively derived datasets, back to original data contributors. We also show this model of transitive credit in action by means of a Data Reuse Simulator. Ultimately, our hope is that, in the longer term, credit models based on direct measures of data reuse will provide further incentives to data publication. We conclude by outlining a research agenda to address the hard questions of creating, collecting, and using DTs systematically across a large number of data reuse instances, in the wild.
Scalable and Efficient Whole-exome Data Processing Using Workflows on the Cloud.
Cala, J.; Marei, E.; Yu, Y.; Takeda, K.; and Missier, P.
Future Generation Computer Systems, In press(Special Issue: Big Data in the Cloud - Best paper award at the FGCS forum 2016). 2016.
Paper
link
bibtex
abstract
@article{Cala2015,
abstract = {Dataflow-style workflows offer a simple, high-level programming model for flexible prototyping of scientific applications as an attractive alternative to low-level scripting. At the same time, workflow management systems (WFMS) may support data parallelism over big datasets by providing scalable, distributed deployment and execution of the workflow over a cloud infrastructure. In theory, the combination of these properties makes workflows a natural choice for implementing Big Data processing pipelines, common for instance in bioinformatics. In practice, however, correct workflow design for parallel Big Data problems can be complex and very time-consuming. In this paper we present our experience in porting a genomics data processing pipeline from an existing scripted implementation deployed on a closed HPC cluster, to a workflow-based design deployed on the Microsoft Azure public cloud. We draw two contrasting and general conclusions from this project. On the positive side, we show that our solution based on the e-Science Central WFMS and deployed in the cloud clearly outperforms the original HPC-based implementation achieving up to 2.3x speed-up. However, in order to deliver such performance we describe the importance of optimising the workflow deployment model to best suit the characteristics of the cloud computing infrastructure. The main reason for the performance gains was the availability of fast, node-local SSD disks delivered by D-series Azure VMs combined with the implicit use of local disk resources by e-Science Central workflow engines. These conclusions suggest that, on parallel Big Data problems, it is important to couple understanding of the cloud computing architecture and its software stack with simplicity of design, and that further efforts in automating parallelisation of complex pipelines are required.},
author = {Cala, Jacek and Marei, Eyad and Yu, Yaobo and Takeda, Kenji and Missier, Paolo},
journal = {Future Generation Computer Systems},
keywords = {Cloud computing,HPC,Performance analysis,Whole-exome sequencing,Workflow-based application,cloud,genomics,workflow},
mendeley-groups = {Paolo-public},
mendeley-tags = {workflow,cloud,genomics},
number = {Special Issue: Big Data in the Cloud - Best paper award at the FGCS forum 2016},
publisher = {Elsevier},
title = {{Scalable and Efficient Whole-exome Data Processing Using Workflows on the Cloud}},
volume = {In press},
year = {2016},
urlpaper = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/1-s2.0-S0167739X16000030-main.pdf}
}
Dataflow-style workflows offer a simple, high-level programming model for flexible prototyping of scientific applications as an attractive alternative to low-level scripting. At the same time, workflow management systems (WFMS) may support data parallelism over big datasets by providing scalable, distributed deployment and execution of the workflow over a cloud infrastructure. In theory, the combination of these properties makes workflows a natural choice for implementing Big Data processing pipelines, common for instance in bioinformatics. In practice, however, correct workflow design for parallel Big Data problems can be complex and very time-consuming. In this paper we present our experience in porting a genomics data processing pipeline from an existing scripted implementation deployed on a closed HPC cluster, to a workflow-based design deployed on the Microsoft Azure public cloud. We draw two contrasting and general conclusions from this project. On the positive side, we show that our solution based on the e-Science Central WFMS and deployed in the cloud clearly outperforms the original HPC-based implementation achieving up to 2.3x speed-up. However, in order to deliver such performance we describe the importance of optimising the workflow deployment model to best suit the characteristics of the cloud computing infrastructure. The main reason for the performance gains was the availability of fast, node-local SSD disks delivered by D-series Azure VMs combined with the implicit use of local disk resources by e-Science Central workflow engines. These conclusions suggest that, on parallel Big Data problems, it is important to couple understanding of the cloud computing architecture and its software stack with simplicity of design, and that further efforts in automating parallelisation of complex pipelines are required.
2015
(4)
Bootstrapping Personalised Human Activity Recognition Models Using Online Active Learning.
Miu, T.; Missier, P.; and Plötz, T.
In Proceedings of the 14th IEEE International Conference on Ubiquitous Computing and Communications, 2015.
Paper
link
bibtex
@inproceedings{Miu2015,
author = {Miu, T. and Missier, P. and Pl{\"o}tz, T.},
booktitle = {Proceedings of the 14th IEEE International Conference on Ubiquitous Computing and Communications},
title = {{Bootstrapping Personalised Human Activity Recognition Models Using Online Active Learning}},
year = {2015},
urlpaper = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/tudor-Liverpool.pdf}
}
Access control and view generation for provenance graphs.
Danger, R.; Curcin, V.; Missier, P.; and Bryans, J.
Future Generation Computer Systems, 49: 8–27. February 2015.
Paper
doi
link
bibtex
abstract
@article{Danger2015,
  abstract = {Data provenance refers to the knowledge about data sources and operations carried out to obtain some piece of data. A provenance-enabled system maintains record of the interoperation of processes across different modules, stages and authorities to capture the full lineage of the resulting data, and typically allows data-focused audits using semantic technologies, such as ontologies, that capture domain knowledge. However, regulating access to captured provenance data is a non-trivial problem, since execution records form complex, overlapping graphs with individual nodes possibly being subject to different access policies. Applying traditional access control to provenance queries can either hide from the user the entire graph with nodes that had access to them denied, reveal too much information, or return a semantically invalid graph. An alternative approach is to answer queries with a new graph that abstracts over the missing nodes and fragments. In this paper, we present TACLP, an access control language for provenance data that supports this approach, together with an algorithm that transforms graphs according to sets of access restrictions. The algorithm produces safe and valid provenance graphs that retain the maximum amount of information allowed by the security model. The approach is demonstrated on an example of restricting access to a clinical trial provenance trace.},
  author   = {Danger, Roxana and Curcin, Vasa and Missier, Paolo and Bryans, Jeremy},
  doi      = {10.1016/j.future.2015.01.014},
  issn     = {0167739X},
  journal  = {Future Generation Computer Systems},
  month    = feb,
  pages    = {8--27},
  title    = {{Access control and view generation for provenance graphs}},
  url      = {http://www.sciencedirect.com/science/article/pii/S0167739X1500031X},
  volume   = {49},
  year     = {2015}
}
Data provenance refers to the knowledge about data sources and operations carried out to obtain some piece of data. A provenance-enabled system maintains record of the interoperation of processes across different modules, stages and authorities to capture the full lineage of the resulting data, and typically allows data-focused audits using semantic technologies, such as ontologies, that capture domain knowledge. However, regulating access to captured provenance data is a non-trivial problem, since execution records form complex, overlapping graphs with individual nodes possibly being subject to different access policies. Applying traditional access control to provenance queries can either hide from the user the entire graph with nodes that had access to them denied, reveal too much information, or return a semantically invalid graph. An alternative approach is to answer queries with a new graph that abstracts over the missing nodes and fragments. In this paper, we present TACLP, an access control language for provenance data that supports this approach, together with an algorithm that transforms graphs according to sets of access restrictions. The algorithm produces safe and valid provenance graphs that retain the maximum amount of information allowed by the security model. The approach is demonstrated on an example of restricting access to a clinical trial provenance trace.
Recent advances in Scalable Workflow Enactment Engines and Technologies.
Hidders, J.; Missier, P.; and Sroka, J.
Future Generation Computer Systems, 46: 1–2. May 2015.
Paper
doi
link
bibtex
@article{Hidders2015,
author = {Hidders, Jan and Missier, Paolo and Sroka, Jacek},
doi = {10.1016/j.future.2015.01.003},
issn = {0167-739X},
journal = {Future Generation Computer Systems},
month = may,
pages = {1--2},
title = {{Recent advances in Scalable Workflow Enactment Engines and Technologies}},
url = {http://www.sciencedirect.com/science/article/pii/S0167739X15000047},
volume = {46},
year = {2015}
}
2014
(9)
On Strategies for Budget-based Online Annotation in Human Activity Recognition.
Miu, T.; Plötz, T.; Missier, P.; and Roggen, D.
In Proceedings of the 2014 ACM International Joint Conference on Pervasive and Ubiquitous Computing: Adjunct Publication, of UbiComp '14 Adjunct, pages 767–776, New York, NY, USA, 2014. ACM
Paper
doi
link
bibtex
@inproceedings{Miu:2014:SBO:2638728.2641300,
address = {New York, NY, USA},
author = {Miu, Tudor and Pl\"{o}tz, Thomas and Missier, Paolo and Roggen, Daniel},
booktitle = {Proceedings of the 2014 ACM International Joint Conference on Pervasive and Ubiquitous Computing: Adjunct Publication},
doi = {10.1145/2638728.2641300},
isbn = {978-1-4503-3047-3},
keywords = {activity recognition, budget-based annotation, online learning},
pages = {767--776},
publisher = {ACM},
series = {UbiComp '14 Adjunct},
title = {{On Strategies for Budget-based Online Annotation in Human Activity Recognition}},
url = {http://doi.acm.org/10.1145/2638728.2641300},
year = {2014}
}
ProvGen: generating synthetic PROV graphs with predictable structure.
Firth, H.; and Missier, P.
In Procs. IPAW 2014 (Provenance and Annotations), Koln, Germany, 2014. Springer
Paper
link
bibtex
12 downloads
@inproceedings{Firth2014,
address = {K{\"o}ln, Germany},
author = {Firth, Hugo and Missier, Paolo},
booktitle = {Procs. IPAW 2014 (Provenance and Annotations)},
publisher = {Springer},
title = {{ProvGen: generating synthetic PROV graphs with predictable structure}},
year = {2014},
urlpaper = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/provGen2014.pdf}
}
From scripted HPC-based NGS pipelines to workflows on the cloud.
Cala, J.; Xu, Y. X.; Wijaya, E. A.; and Missier, P.
In Procs. C4Bio workshop, co-located with the 2014 CCGrid conference, Chicago, IL, 2014. IEEE
Paper
link
bibtex
@inproceedings{Cala2014,
address = {Chicago, IL},
author = {Cala, Jacek and Xu, Yaobo and Wijaya, Eldarina Azfar and Missier, Paolo},
booktitle = {Procs. C4Bio workshop, co-located with the 2014 CCGrid conference},
keywords = {NGS,pipeline,scientific workflows,workflow},
mendeley-tags = {NGS,pipeline,scientific workflows,workflow},
publisher = {IEEE},
title = {{From scripted HPC-based NGS pipelines to workflows on the cloud}},
year = {2014},
urlpaper = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/C4Bio.pdf}
}
The PBase Scientific Workflow Provenance Repository.
Cuevas-Vicenttín, V.; Kianmajd, P.; Ludäscher, B.; Missier, P.; Chirigati, F.; Wei, Y.; Koop, D.; and Dey, S.
In Procs. 9th International Digital Curation Conference, San Francisco, CA, USA, 2014.
Paper
link
bibtex
@inproceedings{Cuevas-Vicenttin2014,
address = {San Francisco, CA, USA},
author = {Cuevas-Vicentt\'{\i}n, V\'{\i}ctor and Kianmajd, Parisa and Lud\"{a}scher, Bertram and Missier, Paolo and Chirigati, Fernando and Wei, Yaxing and Koop, David and Dey, Saumen},
booktitle = {Procs. 9th International Digital Curation Conference},
keywords = {\#DataONE,\#provenance,\#workflow},
mendeley-tags = {\#DataONE,\#provenance,\#workflow},
title = {{The PBase Scientific Workflow Provenance Repository}},
year = {2014},
urlpaper = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/idcc14-pbasefinal.pdf}
}
Distilling structure in Taverna scientific workflows: a refactoring approach.
Cohen-Boulakia, S.; Chen, J.; Missier, P.; Goble, C.; Williams, A.; and Froidevaux, C.
BMC Bioinformatics, 15(Suppl 1): S12. 2014.
Paper
doi
link
bibtex
abstract
1 download
@article{Cohen-Boulakia2014,
abstract = {BACKGROUND:Scientific workflows management systems are increasingly used to specify and manage bioinformatics experiments. Their programming model appeals to bioinformaticians, who can use them to easily specify complex data processing pipelines. Such a model is underpinned by a graph structure, where nodes represent bioinformatics tasks and links represent the dataflow. The complexity of such graph structures is increasing over time, with possible impacts on scientific workflows reuse. In this work, we propose effective methods for workflow design, with a focus on the Taverna model. We argue that one of the contributing factors for the difficulties in reuse is the presence of "anti-patterns", a term broadly used in program design, to indicate the use of idiomatic forms that lead to over-complicated design. The main contribution of this work is a method for automatically detecting such anti-patterns, and replacing them with different patterns which result in a reduction in the workflow's overall structural complexity. Rewriting workflows in this way will be beneficial both in terms of user experience (easier design and maintenance), and in terms of operational efficiency (easier to manage, and sometimes to exploit the latent parallelism amongst the tasks).RESULTS:We have conducted a thorough study of the workflows structures available in Taverna, with the aim of finding out workflow fragments whose structure could be made simpler without altering the workflow semantics. We provide four contributions. Firstly, we identify a set of anti-patterns that contribute to the structural workflow complexity. Secondly, we design a series of refactoring transformations to replace each anti-pattern by a new semantically-equivalent pattern with less redundancy and simplified structure. Thirdly, we introduce a distilling algorithm that takes in a workflow and produces a distilled semantically-equivalent workflow. 
Lastly, we provide an implementation of our refactoring approach that we evaluate on both the public Taverna workflows and on a private collection of workflows from the BioVel project.CONCLUSION:We have designed and implemented an approach to improving workflow structure by way of rewriting preserving workflow semantics. Future work includes considering our refactoring approach during the phase of workflow design and proposing guidelines for designing distilled workflows.},
author = {Cohen-Boulakia, Sarah and Chen, Jiuqiang and Missier, Paolo and Goble, Carole and Williams, Alan and Froidevaux, Christine},
doi = {10.1186/1471-2105-15-S1-S12},
issn = {1471-2105},
journal = {BMC Bioinformatics},
keywords = {\#taverna,\#workflow},
mendeley-tags = {\#taverna,\#workflow},
number = {Suppl 1},
pages = {S12},
title = {{Distilling structure in Taverna scientific workflows: a refactoring approach}},
url = {http://www.biomedcentral.com/1471-2105/15/S1/S12},
urlpaper={http://homepages.cs.ncl.ac.uk/paolo.missier/doc/BMCBionf-Sarah.pdf},
volume = {15},
year = {2014}
}
BACKGROUND:Scientific workflows management systems are increasingly used to specify and manage bioinformatics experiments. Their programming model appeals to bioinformaticians, who can use them to easily specify complex data processing pipelines. Such a model is underpinned by a graph structure, where nodes represent bioinformatics tasks and links represent the dataflow. The complexity of such graph structures is increasing over time, with possible impacts on scientific workflows reuse. In this work, we propose effective methods for workflow design, with a focus on the Taverna model. We argue that one of the contributing factors for the difficulties in reuse is the presence of "anti-patterns", a term broadly used in program design, to indicate the use of idiomatic forms that lead to over-complicated design. The main contribution of this work is a method for automatically detecting such anti-patterns, and replacing them with different patterns which result in a reduction in the workflow's overall structural complexity. Rewriting workflows in this way will be beneficial both in terms of user experience (easier design and maintenance), and in terms of operational efficiency (easier to manage, and sometimes to exploit the latent parallelism amongst the tasks).RESULTS:We have conducted a thorough study of the workflows structures available in Taverna, with the aim of finding out workflow fragments whose structure could be made simpler without altering the workflow semantics. We provide four contributions. Firstly, we identify a set of anti-patterns that contribute to the structural workflow complexity. Secondly, we design a series of refactoring transformations to replace each anti-pattern by a new semantically-equivalent pattern with less redundancy and simplified structure. Thirdly, we introduce a distilling algorithm that takes in a workflow and produces a distilled semantically-equivalent workflow. 
Lastly, we provide an implementation of our refactoring approach that we evaluate on both the public Taverna workflows and on a private collection of workflows from the BioVel project.CONCLUSION:We have designed and implemented an approach to improving workflow structure by way of rewriting preserving workflow semantics. Future work includes considering our refactoring approach during the phase of workflow design and proposing guidelines for designing distilled workflows.
2013
(5)
Provenance and data differencing for workflow reproducibility analysis.
Missier, P.; Woodman, S.; Hiden, H.; and Watson, P.
Concurrency and Computation: Practice and Experience. 2013.
Paper
doi
link
bibtex
abstract
8 downloads
@article{CPE:CPE3035,
abstract = {One of the foundations of science is that researchers must publish the methodology used to achieve their results so that others can attempt to reproduce them. This has the added benefit of allowing methods to be adopted and adapted for other purposes. In the field of e-Science, services – often choreographed through workflow, process data to generate results. The reproduction of results is often not straightforward as the computational objects may not be made available or may have been updated since the results were generated. For example, services are often updated to fix bugs or improve algorithms. This paper addresses these problems in three ways. Firstly, it introduces a new framework to clarify the range of meanings of ‘reproducibility’. Secondly, it describes a new algorithm, PDIFF, that uses a comparison of workflow provenance traces to determine whether an experiment has been reproduced; the main innovation is that if this is not the case then the specific point(s) of divergence are identified through graph analysis, assisting any researcher wishing to understand those differences. One key feature is support for user-defined, semantic data comparison operators. Finally, the paper describes an implementation of PDIFF that leverages the power of the e-Science Central platform that enacts workflows in the cloud. As well as automatically generating a provenance trace for consumption by PDIFF, the platform supports the storage and reuse of old versions of workflows, data and services; the paper shows how this can be powerfully exploited to achieve reproduction and reuse. Copyright © 2013 John Wiley \& Sons, Ltd.},
author = {Missier, Paolo and Woodman, Simon and Hiden, Hugo and Watson, Paul},
doi = {10.1002/cpe.3035},
issn = {1532-0634},
journal = {Concurrency and Computation: Practice and Experience},
keywords = {e-science,provenance,reproducibility,scientific workflow},
title = {{Provenance and data differencing for workflow reproducibility analysis}},
url = {http://dx.doi.org/10.1002/cpe.3035},
urlpaper={http://homepages.cs.ncl.ac.uk/paolo.missier/doc/CCPE-2012.pdf},
year = {2013}
}
One of the foundations of science is that researchers must publish the methodology used to achieve their results so that others can attempt to reproduce them. This has the added benefit of allowing methods to be adopted and adapted for other purposes. In the field of e-Science, services – often choreographed through workflow, process data to generate results. The reproduction of results is often not straightforward as the computational objects may not be made available or may have been updated since the results were generated. For example, services are often updated to fix bugs or improve algorithms. This paper addresses these problems in three ways. Firstly, it introduces a new framework to clarify the range of meanings of ‘reproducibility’. Secondly, it describes a new algorithm, PDIFF, that uses a comparison of workflow provenance traces to determine whether an experiment has been reproduced; the main innovation is that if this is not the case then the specific point(s) of divergence are identified through graph analysis, assisting any researcher wishing to understand those differences. One key feature is support for user-defined, semantic data comparison operators. Finally, the paper describes an implementation of PDIFF that leverages the power of the e-Science Central platform that enacts workflows in the cloud. As well as automatically generating a provenance trace for consumption by PDIFF, the platform supports the storage and reuse of old versions of workflows, data and services; the paper shows how this can be powerfully exploited to achieve reproduction and reuse. Copyright © 2013 John Wiley & Sons, Ltd.
D-PROV: extending the PROV provenance model with workflow structure.
Missier, P.; Dey, S.; Belhajjame, K.; Cuevas, V.; and Ludaescher, B.
In Procs. TAPP'13, Lombard, IL, 2013.
Paper
link
bibtex
6 downloads
@inproceedings{Missier2013a,
  address       = {Lombard, IL},
  author        = {Missier, Paolo and Dey, Saumen and Belhajjame, Khalid and Cuevas, Victor and Ludaescher, Bertram},
  booktitle     = {Procs. TAPP'13},
  keywords      = {PROV,workflow-provenance},
  mendeley-tags = {PROV,workflow-provenance},
  title         = {{D-PROV}: extending the {PROV} provenance model with workflow structure},
  urlpaper      = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/D-PROV-TAPP-2013.pdf},
  year          = {2013}
}
2012
(8)
Report from the first workshop on Scalable Workflow Enactment Engines and Technology (SWEET'12) .
Hidders, J.; Sroka, J.; and Missier, P.
In SIGMOD Record, volume 41. December 2012.
Paper
link
bibtex
@incollection{SWEETReport2011,
author = {Hidders, Jan and Sroka, Jacek and Missier, Paolo},
title = {Report from the first workshop on Scalable Workflow Enactment Engines and Technology (SWEET'12)},
booktitle = {SIGMOD Record},
url = {http://www.sigmod.org/publications/sigmod-record/1212/pdfs/12.report.hidders.pdf},
volume = {41},
number = {4},
month = dec,
year = {2012}
}
SWEET '12: Proceedings of the 1st ACM SIGMOD Workshop on Scalable Workflow Execution Engines and Technologies.
Hidders, J.; Missier, P.; and Sroka, J.,
editors.
SIGMOD Record. New York, NY, USA, December 2012.
Paper
link
bibtex
1 download
@proceedings{Hidders:2012:2443416,
address = {New York, NY, USA},
editor = {Hidders, Jan and Missier, Paolo and Sroka, Jacek},
isbn = {978-1-4503-1876-1},
publisher = {SIGMOD Record},
title = {{SWEET '12: Proceedings of the 1st ACM SIGMOD Workshop on Scalable Workflow Execution Engines and Technologies}},
url = {http://dl.acm.org/citation.cfm?id=2443416},
urlpaper={http://homepages.cs.ncl.ac.uk/paolo.missier/doc/SweetReport.pdf},
month = dec,
year = {2012}
}
Modelling Provenance using Structured Occurrence Networks.
Missier, P; Randell, B; and Koutny, M
In Proc. IPAW'12, Santa Barbara, California, 2012. Springer-Verlag, Lecture Notes in Computer Science
Paper
link
bibtex
7 downloads
@inproceedings{Missier2012,
address = {Santa Barbara, California},
author = {Missier, P. and Randell, B. and Koutny, M.},
booktitle = {Proc. IPAW'12},
keywords = {\#provenance},
mendeley-tags = {\#provenance},
publisher = {Springer-Verlag, Lecture Notes in Computer Science},
title = {{Modelling Provenance using Structured Occurrence Networks}},
urlpaper={http://homepages.cs.ncl.ac.uk/paolo.missier/doc/IPAW12-040312.pdf},
year = {2012}
}
A PROV encoding for provenance analysis using deductive rules.
Missier, P; and Belhajjame, K.
In Procs. IPAW'12, Santa Barbara, California, 2012. Springer-Verlag, Lecture Notes in Computer Science
Paper
link
bibtex
@inproceedings{Missier2012a,
address = {Santa Barbara, California},
author = {Missier, P. and Belhajjame, K.},
booktitle = {Procs. IPAW'12},
keywords = {\#PROV,\#datalog,\#provenance},
publisher = {Springer-Verlag, Lecture Notes in Computer Science},
title = {{A PROV encoding for provenance analysis using deductive rules}},
urlpaper={http://homepages.cs.ncl.ac.uk/paolo.missier/doc/IPAW2012-datalog.pdf},
year = {2012}
}
Detecting Duplicate Records in Scientific Workflow Results.
Belhajjame, K.; Missier, P; and Goble, C
In Procs. IPAW'12, Santa Barbara, California, 2012. Springer-Verlag, Lecture Notes in Computer Science
link
bibtex
@inproceedings{Missier2012b,
address = {Santa Barbara, California},
author = {Belhajjame, K. and Missier, P. and Goble, C.},
booktitle = {Procs. IPAW'12},
keywords = {\#PROV,\#provenance},
publisher = {Springer-Verlag, Lecture Notes in Computer Science},
title = {{Detecting Duplicate Records in Scientific Workflow Results}},
year = {2012}
}
Principles of Provenance (Dagstuhl Seminar 12091).
Cheney, J.; Finkelstein, A.; Ludaescher, B.; and Vansummeren, S.
Dagstuhl Reports, 2(2): 84–113. 2012.
Paper
doi
link
bibtex
1 download
@article{cheney_et_al:DR:2012:3507,
address = {Dagstuhl, Germany},
annote = {Keywords: Provenance, Lineage, Metadata, Trust, Repeatability, Accountability},
author = {Cheney, James and Finkelstein, Anthony and Ludaescher, Bertram and Vansummeren, Stijn},
doi = {10.4230/DagRep.2.2.84},
editor = {Cheney, James and Finkelstein, Anthony and Ludaescher, Bertram and Vansummeren, Stijn},
issn = {2192-5283},
journal = {Dagstuhl Reports},
keywords = {\#provenance},
mendeley-tags = {\#provenance},
number = {2},
pages = {84--113},
publisher = {Schloss Dagstuhl--Leibniz-Zentrum fuer Informatik},
title = {{Principles of Provenance (Dagstuhl Seminar 12091)}},
url = {http://drops.dagstuhl.de/opus/volltexte/2012/3507},
volume = {2},
year = {2012}
}
Golden Trail: Retrieving the Data History that Matters from a Comprehensive Provenance Repository.
Missier, P.; Ludascher, B.; Bowers, S.; Altintas, I.; Dey, S.; and Agun, M.
International Journal of Digital Curation, 7(1). 2012.
Paper
link
bibtex
1 download
@article{Missier2011c,
address = {Bristol, UK},
author = {Missier, Paolo and Ludascher, Bertram and Bowers, Shawn and Altintas, Ilkay and Dey, Saumen and Agun, Michael},
journal = {International Journal of Digital Curation},
keywords = {\#provenance,\#repository,\#workflow},
mendeley-tags = {\#provenance,\#repository,\#workflow},
number = {1},
publisher = {UKOLN},
title = {{Golden Trail: Retrieving the Data History that Matters from a Comprehensive Provenance Repository}},
url = {http://www.dcc.ac.uk/events/idcc11},
volume = {7},
year = {2012},
urlpaper={http://homepages.cs.ncl.ac.uk/paolo.missier/doc/221-927-1-PB.pdf}
}
2011
(8)
Achieving Reproducibility by Combining Provenance with Service and Workflow Versioning.
Woodman, S.; Hiden, H.; Watson, P.; and Missier, P.
In Procs. WORKS 2011, Seattle, WA, USA, 2011.
link
bibtex
@inproceedings{Woodman2011,
  address       = {Seattle, WA, USA},
  author        = {Woodman, Simon and Hiden, Hugo and Watson, Paul and Missier, Paolo},
  booktitle     = {Procs. WORKS 2011},
  keywords      = {cloud,provenance,reproducible science,workflow},
  mendeley-tags = {cloud,provenance,reproducible science,workflow},
  title         = {{Achieving Reproducibility by Combining Provenance with Service and Workflow Versioning}},
  year          = {2011}
}
Towards the preservation of scientific workflows.
Roure, D. D.; Belhajjame, K.; Missier, P.; and Al., E.
In Procs. of the 8th International Conference on Preservation of Digital Objects (iPRES 2011), Singapore, 2011.
Paper
link
bibtex
@inproceedings{Roure2011,
address = {Singapore},
author = {{De Roure}, David and Belhajjame, Khalid and Missier, Paolo and others},
booktitle = {Procs. of the 8th International Conference on Preservation of Digital Objects (iPRES 2011)},
title = {{Towards the preservation of scientific workflows}},
year = {2011},
urlpaper={http://homepages.cs.ncl.ac.uk/paolo.missier/doc/wfpreservev12.pdf}
}
Why linked data is not enough for scientists.
Bechhofer, S.; Buchan, I.; De Roure, D.; Missier, P.; and Al., E.
Future Generation Computer Systems (FGCS). 2011.
Paper
doi
link
bibtex
2 downloads
@article{Bechhofer2011,
author = {Bechhofer, Sean and Buchan, Iain and {De Roure}, David and Missier, Paolo and others},
doi = {10.1016/j.future.2011.08.004},
journal = {Future Generation Computer Systems (FGCS)},
publisher = {Elsevier},
title = {{Why linked data is not enough for scientists}},
url = {http://www.sciencedirect.com/science/article/pii/S0167739X11001439},
year = {2011}
}
Incremental workflow improvement through analysis of its data provenance.
Missier, P.
In Procs. TAPP'11 (Theory and Practice of Provenance), Heraklyion, Crete, Greece, June 2011.
Paper
link
bibtex
abstract
@inproceedings{Missier2011a,
abstract = {Repeated executions of resource-intensive workflows over a large number of runs are commonly observed in e-science practice. We explore the hypothesis that, in some cases, provenance traces recorded for past runs of a workflow can be used to make future runs more efficient. This investigation is an initial step into the systematic study of the role that provenance analysis can play in the broader context of self-managing software systems. We have tested our hypothesis on a concrete case study involving a Chemical Engineering workflow deployed on a cloud infrastructure, where we can measure the cost of its repeated execution. Our approach involves augmenting the workflow with a feedback loop in which incremental analysis of the provenance of past runs is used to control some of the workflow steps in subsequent executions. We present initial experimental results and hint at future improvements as part of ongoing work.},
address = {Heraklyion, Crete, Greece},
author = {Missier, Paolo},
booktitle = {Procs. TAPP'11 (Theory and Practice of Provenance)},
title = {{Incremental workflow improvement through analysis of its data provenance}},
year = {2011},
month = jun,
urlpaper={http://homepages.cs.ncl.ac.uk/paolo.missier/doc/TAPP-missier.pdf}
}
Repeated executions of resource-intensive workflows over a large number of runs are commonly observed in e-science practice. We explore the hypothesis that, in some cases, provenance traces recorded for past runs of a workflow can be used to make future runs more efficient. This investigation is an initial step into the systematic study of the role that provenance analysis can play in the broader context of self-managing software systems. We have tested our hypothesis on a concrete case study involving a Chemical Engineering workflow deployed on a cloud infrastructure, where we can measure the cost of its repeated execution. Our approach involves augmenting the workflow with a feedback loop in which incremental analysis of the provenance of past runs is used to control some of the workflow steps in subsequent executions. We present initial experimental results and hint at future improvements as part of ongoing work.
Simulating Taverna workflows using stochastic process algebras.
Curcin, V.; Missier, P.; and De Roure, D.
Concurrency and Computation: Practice and Experience, In press. 2011.
Paper
link
bibtex
@article{Curcin2011,
author = {Curcin, Vasa and Missier, Paolo and {De Roure}, David},
journal = {Concurrency and Computation: Practice and Experience},
note = {In press},
title = {{Simulating Taverna workflows using stochastic process algebras}},
year = {2011},
urlpaper={http://homepages.cs.ncl.ac.uk/paolo.missier/doc/Simulating%20Taverna_v6.pdf}
}
Workflows to Open Provenance Graphs, round-trip.
Missier, P.; and Goble, C.
Future Generation Computer Systems (FGCS), 27(6): 812–819. April 2011.
Paper
doi
link
bibtex
abstract
@article{Missier2011,
abstract = {The Open Provenance Model is designed to capture relationships amongst data values, and amongst processors that produce or consume those values. While OPM graphs are able to describe aspects of a workflow execution, capturing the structure of the workflows themselves is understandably beyond the scope of the OPM specification, since the graphs may be generated by a broad variety of processes, which may not be formal workflows at all. \% In particular, OPM does not address two questions: firstly, whether for any OPM graph there exists a $\backslash$textit\{plausible\} workflow, in some model, which could have generated the graph. And secondly, which information should be captured as part of an OPM graph that is derived from the execution of some known type of workflow, so that the workflow structure and the execution trace can both be inferred back from the graph. \% Motivated by the need to address the $\backslash$textit\{Third Provenance Challenge\} using Taverna workflows and provenance, in this paper we explore such notion of $\backslash$textit\{lossless-ness\} of OPM graphs relative to Taverna workflows. \% For the first question, we show that Taverna is a suitable model for representing plausible OPM-generating processes. For the second question, we show how augmenting OPM with two types of annotations makes it lossless with respect to Taverna. We support this claim by presenting a two-way mapping between OPM graphs and Taverna workflows.},
author = {Missier, Paolo and Goble, Carole},
doi = {10.1016/j.future.2010.10.012},
journal = {Future Generation Computer Systems (FGCS)},
keywords = {\#OPM,\#provenance,OPM},
mendeley-tags = {\#OPM,\#provenance},
number = {6},
pages = {812--819},
publisher = {Elsevier},
title = {{Workflows to Open Provenance Graphs, round-trip}},
volume = {27},
year = {2011},
month = apr,
urlpaper={http://www.mendeley.com/download/personal/212462/3538519411/e3fb0fd4c581dc383cb202183f4773dd48ff8c55/dl.pdf}
}
The Open Provenance Model is designed to capture relationships amongst data values, and amongst processors that produce or consume those values. While OPM graphs are able to describe aspects of a workflow execution, capturing the structure of the workflows themselves is understandably beyond the scope of the OPM specification, since the graphs may be generated by a broad variety of processes, which may not be formal workflows at all. In particular, OPM does not address two questions: firstly, whether for any OPM graph there exists a plausible workflow, in some model, which could have generated the graph. And secondly, which information should be captured as part of an OPM graph that is derived from the execution of some known type of workflow, so that the workflow structure and the execution trace can both be inferred back from the graph. Motivated by the need to address the Third Provenance Challenge using Taverna workflows and provenance, in this paper we explore such notion of lossless-ness of OPM graphs relative to Taverna workflows. For the first question, we show that Taverna is a suitable model for representing plausible OPM-generating processes. For the second question, we show how augmenting OPM with two types of annotations makes it lossless with respect to Taverna. We support this claim by presenting a two-way mapping between OPM graphs and Taverna workflows.
Extending Semantic Provenance into the Web of Data.
Zhao, J.; Sahoo, S. S; Missier, P.; Sheth, A.; and Goble, C.
IEEE Internet Computing, 15: 40–48. 2011.
Paper
doi
link
bibtex
@article{10.1109/MIC.2011.7,
address = {Los Alamitos, CA, USA},
author = {Zhao, Jun and Sahoo, Satya S and Missier, Paolo and Sheth, Amit and Goble, Carole},
doi = {10.1109/MIC.2011.7},
issn = {1089-7801},
journal = {IEEE Internet Computing},
pages = {40--48},
publisher = {IEEE Computer Society},
url={http://doi.ieeecomputersociety.org/10.1109/MIC.2011.7},
title = {{Extending Semantic Provenance into the Web of Data}},
volume = {15},
year = {2011}
}
2010
(11)
Why Linked Data is Not Enough for Scientists.
Bechhofer, S.; Ainsworth, J.; Bhagat, J.; Buchan, I.; Couch, P.; Cruickshank, D.; Roure, D. D.; Delderfield, M.; Dunlop, I.; Gamble, M.; Goble, C.; Michaelides, D.; Missier, P.; Owen, S.; Newman, D.; and Sufi, S.
In e-Science (e-Science), 2010 IEEE Sixth International Conference on, pages 300–307, 2010.
Paper
doi
link
bibtex
@inproceedings{5693931,
author = {Bechhofer, Sean and Ainsworth, John and Bhagat, Jiten and Buchan, Iain and Couch, Philip and Cruickshank, Don and Roure, David De and Delderfield, Mark and Dunlop, Ian and Gamble, Matthew and Goble, Carole and Michaelides, Danius and Missier, Paolo and Owen, Stuart and Newman, David and Sufi, Shoaib},
booktitle = {e-Science (e-Science), 2010 IEEE Sixth International Conference on},
doi = {10.1109/eScience.2010.21},
pages = {300--307},
title = {{Why Linked Data is Not Enough for Scientists}},
url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=5693931},
year = {2010},
urlpaper={http://www.mendeley.com/download/public/212462/3654049861/b5af47a9fee58a215547315f8f82a4efce0e27ea/dl.pdf}
}
Linking Multiple Workflow Provenance Traces for Interoperable Collaborative Science.
Missier, P.; Ludascher, B.; Bowers, S.; Anand, M. K.; Altintas, I.; Dey, S.; Sarkar, A.; Shrestha, B.; and Goble, C.
In Proc.s 5th Workshop on Workflows in Support of Large-Scale Science (WORKS), 2010.
Paper
link
bibtex
abstract
@inproceedings{Missier2010e,
abstract = {Scientific collaboration increasingly involves data sharing between separate groups. We consider a scenario where data products of scientific workflows are published and then used by other researchers as inputs to their workflows. For proper interpretation, shared data must be complemented by descriptive metadata. We focus on provenance traces, a prime example of such metadata which describes the genesis and processing history of data products in terms of the computational workflow steps. Through the reuse of published data, virtual, implicitly collaborative experiments emerge, making it desirable to compose the independently generated traces into global ones that describe the combined executions as single, seamless experiments. We present a model for provenance sharing that realizes this holistic view by overcoming the various interoperability problems that emerge from the heterogeneity of workflow systems, data formats, and provenance models. At the heart lie (i) an abstract workflow and provenance model in which (ii) data sharing becomes itself part of the combined workflow. We then describe an implementation of our model that we developed in the context of the Data Observation Network for Earth (DataONE) project and that can “stitch together” traces from different Kepler and Taverna workflow runs. It provides a prototypical framework for seamless cross-system, collaborative provenance management and can be easily extended to include other systems. Our approach also opens the door to new ways of workflow interoperability not only through often elusive workflow standards but through shared provenance information from public repositories.},
author = {Missier, Paolo and Ludascher, Bertram and Bowers, Shawn and Anand, Manish Kumar and Altintas, Ilkay and Dey, Saumen and Sarkar, Anandarup and Shrestha, Biva and Goble, Carole},
booktitle = {Proc.s 5th Workshop on Workflows in Support of Large-Scale Science (WORKS)},
file = {:Users/paolo/Library/Application Support/Mendeley Desktop/Downloaded/Missier et al. - 2010 - Linking Multiple Workflow Provenance Traces for Interoperable Collaborative Science.pdf:pdf},
title = {{Linking Multiple Workflow Provenance Traces for Interoperable Collaborative Science}},
year = {2010},
urlpaper = {http://homepages.cs.ncl.ac.uk/paolo.missier/doc/WORKS10.pdf}
}
Scientific collaboration increasingly involves data sharing between separate groups. We consider a scenario where data products of scientific workflows are published and then used by other researchers as inputs to their workflows. For proper interpretation, shared data must be complemented by descriptive metadata. We focus on provenance traces, a prime example of such metadata which describes the genesis and processing history of data products in terms of the computational workflow steps. Through the reuse of published data, virtual, implicitly collaborative experiments emerge, making it desirable to compose the independently generated traces into global ones that describe the combined executions as single, seamless experiments. We present a model for provenance sharing that realizes this holistic view by overcoming the various interoperability problems that emerge from the heterogeneity of workflow systems, data formats, and provenance models. At the heart lie (i) an abstract workflow and provenance model in which (ii) data sharing becomes itself part of the combined workflow. We then describe an implementation of our model that we developed in the context of the Data Observation Network for Earth (DataONE) project and that can “stitch together” traces from different Kepler and Taverna workflow runs. It provides a prototypical framework for seamless cross-system, collaborative provenance management and can be easily extended to include other systems. Our approach also opens the door to new ways of workflow interoperability not only through often elusive workflow standards but through shared provenance information from public repositories.
The Open Provenance Model — Core Specification (v1.1).
Moreau, L.; Clifford, B.; Freire, J.; Futrelle, J.; Gil, Y.; Groth, P.; Kwasnikowska, N.; Miles, S.; Missier, P.; Myers, J.; Plale, B.; Simmhan, Y.; Stephan, E.; and Van Den Bussche, J.
Future Generation Computer Systems. 2010.
Paper
doi
link
bibtex
@article{Moreau2010a,
author = {Moreau, Luc and Clifford, Ben and Freire, Juliana and Futrelle, Joe and Gil, Yolanda and Groth, Paul and Kwasnikowska, Natalia and Miles, Simon and Missier, Paolo and Myers, Jim and Plale, Beth and Simmhan, Yogesh and Stephan, Eric and {Van Den Bussche}, Jan},
doi = {10.1016/j.future.2010.07.005},
file = {:Users/paolo/Documents/myGRID/refs/opm.pdf:pdf},
journal = {Future Generation Computer Systems},
title = {{The Open Provenance Model --- Core Specification (v1.1)}},
year = {2010},
urlpaper = {http://www.mendeley.com/download/public/212462/3464695141/849c7a26f20b3fa9338bc8d6ab292b6c8332cf8d/dl.pdf}
}
Seamless Provenance Representation and Use in Collaborative Science Scenarios (Abstract).
Missier, P.; Ludascher, B.; Bowers, S.; Anand, M. K.; Altintas, I.; Dey, S.; Sarkar, A.; Shrestha, B.; and Goble, C.
In AGU Fall Meeting, San Francisco, CA, USA, 2010.
Paper
link
bibtex
@inproceedings{Missier2010d,
author = {Missier, Paolo and Ludascher, Bertram and Bowers, Shawn and Anand, Manish Kumar and Altintas, Ilkay and Dey, Saumen and Sarkar, Anandarup and Shrestha, Biva and Goble, Carole},
title = {{Seamless Provenance Representation and Use in Collaborative Science Scenarios (Abstract)}},
booktitle = {AGU Fall Meeting},
address = {San Francisco, CA, USA},
file = {:Users/paolo/Documents/DataONE/AGU 2010 abstracts/AGU-IN02 submission.pdf:pdf},
urlpaper = {http://www.mendeley.com/download/public/212462/1436102841/a3435cce7917d1995fdb70485b20efa0d3ffdf81/dl.pdf},
year = {2010}
}
A comparison of using Taverna and BPEL in building scientific workflows: the case of caGrid.
Tan, W.; Missier, P.; Foster, I.; Madduri, R.; De Roure, D.; and Goble, C.
Concurrency and Computation: Practice and Experience, 22(9): 1098–1117. 2010.
Paper
doi
link
bibtex
@article{CPE:CPE1547,
author = {Tan, Wei and Missier, Paolo and Foster, Ian and Madduri, Ravi and {De Roure}, David and Goble, Carole},
doi = {10.1002/cpe.1547},
file = {:Users/paolo/Library/Application Support/Mendeley Desktop/Downloaded/Tan et al. - 2010 - A comparison of using Taverna and BPEL in building scientific workflows the case of caGrid.pdf:pdf},
issn = {1532-0634},
journal = {Concurrency and Computation: Practice and Experience},
keywords = {BPEL,Taverna,caGrid,functional programming,scientific workflow},
number = {9},
pages = {1098--1117},
publisher = {John Wiley \& Sons, Ltd.},
title = {{A comparison of using Taverna and BPEL in building scientific workflows: the case of caGrid}},
volume = {22},
year = {2010}
}
Fine-grained and efficient lineage querying of collection-based workflow provenance.
Missier, P.; Paton, N.; and Belhajjame, K.
In Procs. EDBT, Lausanne, Switzerland, 2010.
Paper
link
bibtex
1 download
@inproceedings{Missier2010a,
address = {Lausanne, Switzerland},
author = {Missier, P. and Paton, N. and Belhajjame, K.},
booktitle = {Procs. EDBT},
file = {:Users/paolo/Library/Application Support/Mendeley Desktop/Downloaded/Missier, Paton, Belhajjame - 2010 - Fine-grained and efficient lineage querying of collection-based workflow provenance.pdf:pdf},
internal-note = {urlpaper link is byte-identical to the one in entry Missier2010d (AGU abstract) -- likely a copy-paste error, verify the correct PDF link},
title = {{Fine-grained and efficient lineage querying of collection-based workflow provenance}},
year = {2010},
urlpaper = {http://www.mendeley.com/download/public/212462/1436102841/a3435cce7917d1995fdb70485b20efa0d3ffdf81/dl.pdf}
}
ERGOT: A Semantic-based System for Service Discovery in Distributed Infrastructures.
Pirro', G.; Trunfio, P.; Talia, D.; Missier, P.; and Goble, C.
In Procs. CCGRID '10, Melbourne, Australia, 2010.
doi
link
bibtex
@inproceedings{pirro10:,
address = {Melbourne, Australia},
author = {Pirro', Giuseppe and Trunfio, Paolo and Talia, Domenico and Missier, Paolo and Goble, Carole},
booktitle = {Procs. CCGRID '10},
doi = {10.1109/CCGRID.2010.24},
file = {:Users/paolo/Documents/myGRID/refs/p299-missier.pdf:pdf},
title = {{ERGOT: A Semantic-based System for Service Discovery in Distributed Infrastructures}},
year = {2010}
}
Functional Units: Abstractions for Web Service Annotations.
Missier, P.; Wolstencroft, K.; Tanoh, F.; Li, P.; Bechhofer, S.; Belhajjame, K.; and Goble, C.
In Procs. IEEE 2010 Fourth International Workshop on Scientific Workflows (SWF 2010), Miami, FL, 2010.
Paper
link
bibtex
@inproceedings{Missier2010,
address = {Miami, FL},
author = {Missier, Paolo and Wolstencroft, Katy and Tanoh, Franck and Li, Peter and Bechhofer, Sean and Belhajjame, Khalid and Goble, Carole},
booktitle = {Procs. IEEE 2010 Fourth International Workshop on Scientific Workflows (SWF 2010)},
file = {:Users/paolo/Library/Application Support/Mendeley Desktop/Downloaded/Missier et al. - 2010 - Functional Units Abstractions for Web Service Annotations.pdf:pdf},
keywords = {service annotations biocatalogue},
title = {{Functional Units: Abstractions for Web Service Annotations}},
url = {http://www.cs.wayne.edu/\~{}shiyong/swf/swf2010.html},
year = {2010},
urlpaper = {http://www.mendeley.com/download/public/212462/2916934991/f6fa568c0c07692423a097a6083e94909b3bbcd6/dl.pdf}
}
Taverna, reloaded.
Missier, P.; Soiland-Reyes, S.; Owen, S.; Tan, W.; Nenadic, A.; Dunlop, I.; Williams, A.; Oinn, T.; and Goble, C.
In Gertz, M; Hey, T; and Ludaescher, B, editor(s), Procs. SSDBM 2010, Heidelberg, Germany, 2010.
Paper
link
bibtex
2 downloads
@inproceedings{Missier2010b,
address = {Heidelberg, Germany},
author = {Missier, Paolo and Soiland-Reyes, Stian and Owen, Stuart and Tan, Wei and Nenadic, Alex and Dunlop, Ian and Williams, Alan and Oinn, Tom and Goble, Carole},
booktitle = {Procs. SSDBM 2010},
editor = {Gertz, M and Hey, T and Ludaescher, B},
file = {:Users/paolo/Documents/myGRID/papers/T2Performance/short-SSDBM/Missier-CR/T2Architecture-SSDBM-CR.pdf:pdf},
title = {{Taverna, reloaded}},
url = {http://www.ssdbm2010.org/},
year = {2010},
urlpaper = {http://www.mendeley.com/download/public/212462/2916934941/e91110015e961be177247d40c8f5dc874367106b/dl.pdf}
}
Janus: from Workflows to Semantic Provenance and Linked Open Data.
Missier, P.; Sahoo, S. S; Zhao, J.; Sheth, A.; and Goble, C.
In Procs. IPAW 2010, Troy, NY, 2010.
Paper
link
bibtex
@inproceedings{Missier2010c,
address = {Troy, NY},
author = {Missier, Paolo and Sahoo, Satya S and Zhao, Jun and Sheth, Amit and Goble, Carole},
booktitle = {Procs. IPAW 2010},
file = {:Users/paolo/Dropbox/Janus/paper-IPAW2010/SP-IPAW10.pdf:pdf},
keywords = {provenance Taverna LOD RDF semantics},
title = {{Janus: from Workflows to Semantic Provenance and Linked Open Data}},
year = {2010},
urlpaper = {http://www.mendeley.com/download/public/212462/2928549431/f166661dce9df21dd0c321fbffb0fb6eb4c85cd1/dl.pdf}
}
Understanding Collaborative Studies Through Interoperable Workflow Provenance.
Altintas, I.; Anand, M. K.; Crawl, D.; Belloum, A.; Missier, P.; Goble, C.; and Sloot, P.
In Procs. IPAW 2010, Troy, NY, 2010.
Paper
link
bibtex
@inproceedings{Altintas2010a,
address = {Troy, NY},
author = {Altintas, Ilkay and Anand, Manish Kumar and Crawl, Daniel and Belloum, Adam and Missier, Paolo and Goble, Carole and Sloot, Peter},
booktitle = {Procs. IPAW 2010},
file = {:Users/paolo/Documents/myGRID/refs/IPAW2010-CP.pdf:pdf},
keywords = {provenance interoperability},
title = {{Understanding Collaborative Studies Through Interoperable Workflow Provenance}},
year = {2010},
urlpaper = {http://www.mendeley.com/download/public/212462/2928549401/bcae9fcd73771e1d2dde11d3123f475178aca3fa/dl.pdf}
}
2009
(9)
Time-completeness trade-offs in record linkage using Adaptive Query Processing.
Lengu, R; Missier, P; Fernandes, A A A; Guerrini, G; and Mesiti, M
In Procs. EDBT, St. Petersburg, Russia, March 2009.
Paper
doi
link
bibtex
@inproceedings{Lengu09,
address = {St. Petersburg, Russia},
annote = {conference},
author = {Lengu, R and Missier, P and Fernandes, A A A and Guerrini, G and Mesiti, M},
booktitle = {Procs. EDBT},
doi = {10.1145/1516360.1516458},
file = {:Users/paolo/Documents/myGRID/refs/p299-missier.pdf:pdf},
keywords = {"Adaptive Query Processing","Record Linkage"},
month = mar,
title = {{Time-completeness trade-offs in record linkage using Adaptive Query Processing}},
year = {2009}
}
A Comparison of Using Taverna and BPEL in Building Scientific Workflows: the case of caGrid.
Tan, W.; Missier, P.; Foster, I.; Madduri, R.; and Goble, C.
Concurrency and Computation Practice and Experience. 2009.
link
bibtex
@article{Tan:2009lk,
annote = {In press},
author = {Tan, Wei and Missier, Paolo and Foster, Ian and Madduri, Ravi and Goble, Carole},
file = {:Users/paolo/Library/Application Support/Mendeley Desktop/Downloaded/Tan et al. - 2009 - A Comparison of Using Taverna and BPEL in Building Scientific Workflows the case of caGrid.pdf:pdf},
internal-note = {appears to be the in-press version of entry CPE:CPE1547 (same title, CCPE 2010) -- consider merging to avoid duplicate citations},
journal = {Concurrency and Computation Practice and Experience},
keywords = {scientific workflow BPEL Taverna},
title = {{A Comparison of Using Taverna and BPEL in Building Scientific Workflows: the case of caGrid}},
year = {2009}
}
Combining DHTs and SONs for Semantic-Based Service Discovery.
Pirro', G.; Missier, P.; Trunfio, P.; Talia, D.; Falace, G.; and Goble, C.
In Procs.International Conference on Intelligent System Design and Applications (ISDA'09), Pisa, Italy, November 2009.
Paper
link
bibtex
@inproceedings{Giuseppe-Pirr:2009oa,
address = {Pisa, Italy},
annote = {conference},
author = {Pirro', Giuseppe and Missier, Paolo and Trunfio, Paolo and Talia, Domenico and Falace, Gabriele and Goble, Carole},
booktitle = {Procs.International Conference on Intelligent System Design and Applications (ISDA'09)},
keywords = {"Distributed Systems"},
month = nov,
title = {{Combining DHTs and SONs for Semantic-Based Service Discovery}},
year = {2009},
urlpaper = {http://www.mendeley.com/download/public/212462/858964852/fd27883c64acca89540c464ddd402b66d5fb3000/dl.pdf}
}
The Data Playground: An Intuitive Workflow Specification Environment.
Gibson, A; Gamble, M; Wolstencroft, K; Oinn, T; Goble, C; Belajjame, K; and Missier, P.
Future Generation Computer Systems, 25: 453–459. April 2009.
Paper
doi
link
bibtex
@article{A.-Gibson:2009ti,
annote = {journal},
author = {Gibson, A and Gamble, M and Wolstencroft, K and Oinn, T and Goble, C and Belajjame, K and Missier, Paolo},
doi = {10.1016/j.future.2008.09.009},
file = {:Users/paolo/Documents/myGRID/refs/p299-missier.pdf:pdf},
journal = {Future Generation Computer Systems},
keywords = {"Workflow Management","e-Science"},
month = apr,
pages = {453--459},
title = {{The Data Playground: An Intuitive Workflow Specification Environment}},
volume = {25},
year = {2009}
}
Semantically Annotated Provenance in the Life Science Grid.
Cao, B; Plale, B; Subramanian, G; Missier, P; Goble, C; and Simmhan, Y
In Freire, J.; Missier, P.; and Sahoo, S. S., editor(s), 1st International Workshop on the Role of Semantic Web in Provenance Management, 2009. CEUR Proceedings
Paper
link
bibtex
@inproceedings{Bin2009,
author = {Cao, B and Plale, B and Subramanian, G and Missier, P and Goble, C and Simmhan, Y},
booktitle = {1st International Workshop on the Role of Semantic Web in Provenance Management},
editor = {Freire, Juliana and Missier, Paolo and Sahoo, Satya S.},
file = {:Users/paolo/Library/Application Support/Mendeley Desktop/Downloaded/Cao et al. - 2009 - Semantically Annotated Provenance in the Life Science Grid(2).pdf:pdf},
keywords = {\#provxg \#provenance,semantics provenance},
mendeley-tags = {\#provxg \#provenance},
publisher = {CEUR Proceedings},
title = {{Semantically Annotated Provenance in the Life Science Grid}},
url = {http://sunsite.informatik.rwth-aachen.de/Publications/CEUR-WS/Vol-526/},
year = {2009},
urlpaper = {http://www.mendeley.com/download/public/212462/982701542/3a8950b4a6495a57e59ccc5166f84737f8dc9c4a/dl.pdf}
}
Medical Image Processing Workflow Support on the EGEE Grid with Taverna.
Maheshwari, K.; Missier, P.; Goble, C.; and Montagnat, J.
In Procs. conference of Computer Based Medical Systems (CBMS), Albuquerque, NM, USA, 2009.
Paper
link
bibtex
@inproceedings{Ketan-Maheshwari:jh,
address = {Albuquerque, NM, USA},
annote = {conference},
author = {Maheshwari, Ketan and Missier, Paolo and Goble, Carole and Montagnat, Johan},
booktitle = {Procs. conference of Computer Based Medical Systems (CBMS)},
file = {:Users/paolo/Library/Application Support/Mendeley Desktop/Downloaded/Maheshwari et al. - 2009 - Medical Image Processing Workflow Support on the EGEE Grid with Taverna.pdf:pdf},
keywords = {"Workflow Management","e-Science"},
title = {{Medical Image Processing Workflow Support on the EGEE Grid with Taverna}},
year = {2009},
urlpaper = {http://www.mendeley.com/download/public/212462/858962842/10cd50751591a44f384a80766f8165723ec21eda/dl.pdf}
}
Data Provenance in Scientific Workflows.
Belhajjame, K.; Missier, P.; and Goble, C.
In Handbook of Research on Computational Grid Technologies for Life Sciences, Biomedicine, and Healthcare. IGI Global, 2009.
link
bibtex
@incollection{Khalid-Belhajjame:2009ix,
annote = {chapter},
author = {Belhajjame, Khalid and Missier, Paolo and Goble, Carole},
booktitle = {Handbook of Research on Computational Grid Technologies for Life Sciences, Biomedicine, and Healthcare},
keywords = {"Workflow Management",Provenance},
publisher = {IGI Global},
title = {{Data Provenance in Scientific Workflows}},
year = {2009}
}
Incorporating Domain-Specific Information Quality Constraints into Database Queries.
Embury, S. M; Missier, P.; Sampaio, S.; Greenwood, R M.; and Preece, A. D
J. Data and Information Quality, 1(2). 2009.
Paper
link
bibtex
@article{DBLP:journals/jdiq/EmburyMSGP09,
author = {Embury, Suzanne M and Missier, Paolo and Sampaio, Sandra and Greenwood, R Mark and Preece, Alun D},
file = {:Users/paolo/Documents/myGRID/refs/Incorporating Domain-Specific Information Quality Constraints into Database Queries.pdf:pdf},
journal = {J. Data and Information Quality},
number = {2},
title = {{Incorporating Domain-Specific Information Quality Constraints into Database Queries}},
volume = {1},
year = {2009},
urlpaper = {http://www.mendeley.com/download/public/212462/3145881381/2d9ae3771e97e17f7899676a5a98f57afd667fbe/dl.pdf}
}
}
2008
(7)
Building Scientific Workflow with Taverna and BPEL: a Comparative Study in caGrid.
Tan, W.; Missier, P.; Madduri, R.; and Foster, I.
In Procs.4th International workshop on Engineering Service-Oriented applications (WESOA), Sydney, Australia, December 2008.
Paper
doi
link
bibtex
@inproceedings{Wei-Tan:2008td,
address = {Sydney, Australia},
annote = {conference},
author = {Tan, Wei and Missier, Paolo and Madduri, Ravi and Foster, Ian},
booktitle = {Procs.4th International workshop on Engineering Service-Oriented applications (WESOA)},
doi = {10.1007/978-3-642-01247-1\_11},
keywords = {"Workflow Management ","e-Science"},
month = dec,
title = {{Building Scientific Workflow with Taverna and BPEL: a Comparative Study in caGrid}},
year = {2008},
urlpaper = {http://www.mendeley.com/download/public/212462/858964232/b62f315e67100842b89c100f95aac798e8b5ee9c/dl.pdf}
}
Scientific Workflows.
Goble, C.; De Roure, D.; and Missier, P.
In Yearbook of Science and Technology. McGraw Hill, 2008.
link
bibtex
@incollection{C.Goble:2008hw,
annote = {chapter},
author = {Goble, Carole and {De Roure}, David and Missier, Paolo},
booktitle = {Yearbook of Science and Technology},
keywords = {"Workflow Management","e-Science"},
publisher = {McGraw Hill},
title = {{Scientific Workflows}},
year = {2008}
}
Exploiting provenance to make sense of automated data acceptance decisions in scientific workflows.
Missier, P.; Embury, S.; and Stapenhurst, R.
In IPAW, volume 5272/2008, of LNCS series, Salt Lake City, Utah, June 2008. Springer
Paper
doi
link
bibtex
@inproceedings{Paolo-Missier:2008zk,
address = {Salt Lake City, Utah},
annote = {DOI: http://dx.doi.org/10.1007/978-3-540-89965-5\_19
conference},
author = {Missier, Paolo and Embury, Suzanne and Stapenhurst, Richard},
booktitle = {IPAW},
doi = {10.1007/978-3-540-89965-5\_19},
file = {:Users/paolo/Library/Application Support/Mendeley Desktop/Downloaded/Missier, Embury, Stapenhurst - 2008 - Exploiting provenance to make sense of automated data acceptance decisions in scientific workflows(4).pdf:pdf},
keywords = {"Information Quality Management",\#provxg \#provenance \#qurator,provenance data quality},
mendeley-tags = {\#provxg \#provenance \#qurator},
month = jun,
publisher = {Springer},
series = {LNCS series},
title = {{Exploiting provenance to make sense of automated data acceptance decisions in scientific workflows}},
url = {http://www.springerlink.com/content/r07524068770k401/},
volume = {5272/2008},
year = {2008},
urlpaper = {http://www.mendeley.com/download/public/212462/836703912/38d5918a52b3c587f4c1d4e465bc068fa1a9cdc9/dl.pdf}
}
Data lineage model for Taverna workflows with lightweight annotation requirements.
Missier, P; Belhajjame, K; Zhao, J; and Goble, C
In IPAW, volume 5272/2008, of LNCS, Salt Lake City, US, June 2008. Springer
Paper
doi
link
bibtex
1 download
@inproceedings{missier-IPAW08a:,
address = {Salt Lake City, US},
annote = {DOI: http://dx.doi.org/10.1007/978-3-540-89965-5\_4
conference},
author = {Missier, P and Belhajjame, K and Zhao, J and Goble, C},
booktitle = {IPAW},
doi = {10.1007/978-3-540-89965-5\_4},
file = {:Users/paolo/Library/Application Support/Mendeley Desktop/Downloaded/Missier et al. - 2008 - Data lineage model for Taverna workflows with lightweight annotation requirements(4).pdf:pdf},
keywords = {"Workflow Management ",\#provxg \#provenance,provenance lineage Taverna annotations},
mendeley-tags = {\#provxg \#provenance},
month = jun,
publisher = {Springer},
series = {LNCS},
title = {{Data lineage model for Taverna workflows with lightweight annotation requirements}},
url = {http://www.springerlink.com/content/36rw83153m0v171h/},
volume = {5272/2008},
year = {2008},
urlpaper = {http://www.mendeley.com/download/public/212462/836704122/965a9eee272375793a11319f906cf7ae6dd001c4/dl.pdf}
}
2007
(7)
Taverna Workflows: Syntax and Semantics.
Turi, D; Missier, P; Roure, D D.; Goble, C; and Oinn, T
In Proceedings of the 3rd e-Science conference, Bangalore, India, December 2007.
Paper
doi
link
bibtex
@inproceedings{Turi07,
address = {Bangalore, India},
author = {Turi, D and Missier, P and Roure, D De and Goble, C and Oinn, T},
booktitle = {Proceedings of the 3rd e-Science conference},
doi = {10.1109/E-SCIENCE.2007.71},
file = {:Users/paolo/Library/Application Support/Mendeley Desktop/Downloaded/Turi et al. - 2007 - Taverna Workflows Syntax and Semantics(5).pdf:pdf},
keywords = {"Language semantics","Workflow Management",Taverna},
month = dec,
title = {{Taverna Workflows: Syntax and Semantics}},
year = {2007},
urlpaper = {http://www.mendeley.com/download/public/212462/858964392/343a55b959d23b7c5a1e42b8fdd848f3bc8074ae/dl.pdf}
}
Accelerating Disease Gene Identification Through Integrated SNP Data Analysis.
Missier, P; Embury, S; Hedeler, C; Greenwood, M; Pennock, J; and Brass, A
In Proceedings 4th International Workshop on Data Integration in the Life Sciences, of LNBI, pages 215–230, 2007. Springer
Paper
doi
link
bibtex
2 downloads
@inproceedings{paolodils07,
author = {Missier, P and Embury, S and Hedeler, C and Greenwood, M and Pennock, J and Brass, A},
booktitle = {Proceedings 4th International Workshop on Data Integration in the Life Sciences},
doi = {10.1007/978-3-540-73255-6\_18},
keywords = {"Distributed Query Processing","Information Quality Management","Web Services",Biological Information Management},
pages = {215--230},
publisher = {Springer},
series = {LNBI},
title = {{Accelerating Disease Gene Identification Through Integrated SNP Data Analysis}},
year = {2007}
}
Quality management challenges in the post-genomic era.
Hedeler, C; and Missier, P
In Database Modeling in Biology: Practices and Challenges. Artech House, 2007.
Paper
link
bibtex
@incollection{hedeler06:,
author = {Hedeler, C and Missier, P},
booktitle = {Database Modeling in Biology: Practices and Challenges},
file = {:Users/paolo/Library/Application Support/Mendeley Desktop/Downloaded/Hedeler, Missier - 2007 - Database Modeling in Biology Practices and Challenges.pdf:pdf},
keywords = {"Biological Information Management","Information Quality Management"},
publisher = {Artech House},
title = {{Quality management challenges in the post-genomic era}},
year = {2007},
urlpaper = {http://www.mendeley.com/download/public/212462/858964882/92edb6a59a8f3f1ef86695fd52a6c9655484d89a/dl.pdf}
}
Architectural patterns for the Semantic Grid.
Kotsiopoulos, I; Missier, P; Alper, P; Corcho, O; Bechhofer, S; and Goble, C
In Talia, D; A.Bilas; and Dikaiakos, M, editor(s), CoreGRID Institute on Knowledge and Data Management, Poznan Workshop, September 2005, volume XVIII, of CoreGRID Series, Knowledge and Data Management in GRIDs. Springer, 2007.
link
bibtex
@incollection{kotsiopoulos07:_archit_seman_grid,
author = {Kotsiopoulos, I and Missier, P and Alper, P and Corcho, O and Bechhofer, S and Goble, C},
title = {{Architectural patterns for the Semantic Grid}},
booktitle = {CoreGRID Institute on Knowledge and Data Management, Poznan Workshop, September 2005},
editor = {Talia, D and A.Bilas and Dikaiakos, M},
series = {CoreGRID Series, Knowledge and Data Management in GRIDs},
volume = {XVIII},
publisher = {Springer},
keywords = {"Semantic Grid"},
year = {2007}
}
2006
(6)
An overview of S-OGSA: A Reference Semantic Grid Architecture.
Corcho, O.; Alper, P.; Kotsiopoulos, I.; Missier, P.; Bechhofer, S.; and Goble, C.
Journal of Web Semantics, 4(2): 102–115. 2006.
Paper
doi
link
bibtex
abstract
@article{CORCHO2006,
abstract = {The Grid's vision, of sharing diverse resources in a flexible, coordinated and secure manner through dynamic formation and disbanding of virtual communities, strongly depends on metadata. Currently, Grid metadata is generated and used in an ad hoc fashion, much of it buried in the Grid middleware's code libraries and database schemas. This ad hoc expression and use of metadata causes chronic dependency on human intervention during the operation of Grid machinery, leading to systems which are brittle when faced with frequent syntactic changes in resource coordination and sharing protocols. The Semantic Grid is an extension of the Grid in which rich resource metadata is exposed and handled explicitly, and shared and managed via Grid protocols. The layering of an explicit semantic infrastructure over the Grid Infrastructure potentially leads to increased interoperability and greater flexibility. In recent years, several projects have embraced the Semantic Grid vision. However, the Semantic Grid lacks a Reference Architecture or any kind of systematic framework for designing Semantic Grid components or applications. The Open Grid Service Architecture (OGSA) aims to define a core set of capabilities and behaviours for Grid systems. We propose a Reference Architecture that extends OGSA to support the explicit handling of semantics, and defines the associated knowledge services to support a spectrum of service capabilities. Guided by a set of design principles, Semantic-OGSA (S-OGSA) defines a model, the capabilities and the mechanisms for the Semantic Grid. We conclude by highlighting the commonalities and differences that the proposed architecture has with respect to other Grid frameworks.},
author = {Corcho, Oscar and Alper, Pinar and Kotsiopoulos, Ioannis and Missier, Paolo and Bechhofer, Sean and Goble, Carole},
doi = {10.1016/j.websem.2006.03.001},
file = {:Users/paolo/Library/Application Support/Mendeley Desktop/Downloaded/CORCHO et al. - 2006 - An overview of S-OGSA A Reference Semantic Grid Architecture.pdf:pdf},
issn = {15708268},
journal = {Journal of Web Semantics},
keywords = {Architecture,Explicit metadata,Grid,Semantic Grid,Semantics},
number = {2},
pages = {102--115},
title = {{An overview of S-OGSA: A Reference Semantic Grid Architecture}},
url = {http://linkinghub.elsevier.com/retrieve/pii/S1570826806000059},
volume = {4},
year = {2006},
urlpaper = {http://www.mendeley.com/download/public/212462/837355462/77dfb167a0b74918da2d3f9fb2ad72eb295e2260/dl.pdf}
}
The Grid's vision, of sharing diverse resources in a flexible, coordinated and secure manner through dynamic formation and disbanding of virtual communities, strongly depends on metadata. Currently, Grid metadata is generated and used in an ad hoc fashion, much of it buried in the Grid middleware's code libraries and database schemas. This ad hoc expression and use of metadata causes chronic dependency on human intervention during the operation of Grid machinery, leading to systems which are brittle when faced with frequent syntactic changes in resource coordination and sharing protocols. The Semantic Grid is an extension of the Grid in which rich resource metadata is exposed and handled explicitly, and shared and managed via Grid protocols. The layering of an explicit semantic infrastructure over the Grid Infrastructure potentially leads to increased interoperability and greater flexibility. In recent years, several projects have embraced the Semantic Grid vision. However, the Semantic Grid lacks a Reference Architecture or any kind of systematic framework for designing Semantic Grid components or applications. The Open Grid Service Architecture (OGSA) aims to define a core set of capabilities and behaviours for Grid systems. We propose a Reference Architecture that extends OGSA to support the explicit handling of semantics, and defines the associated knowledge services to support a spectrum of service capabilities. Guided by a set of design principles, Semantic-OGSA (S-OGSA) defines a model, the capabilities and the mechanisms for the Semantic Grid. We conclude by highlighting the commonalities and differences that the proposed architecture has with respect to other Grid frameworks.
Practical data quality certification: model, architecture, and experiences.
Missier, P; Oliaro, A; and Raffa, S
In IQIS, International Workshop on Information Quality in Information Systems, 30 June 2006, Chicago, USA (SIGMOD 2006 Workshop), 2006. ACM
Paper
link
bibtex
@inproceedings{missier06:_pract,
author = {Missier, P and Oliaro, A and Raffa, S},
booktitle = {IQIS, International Workshop on Information Quality in Information Systems, 30 June 2006, Chicago, USA (SIGMOD 2006 Workshop)},
file = {:Users/paolo/Library/Application Support/Mendeley Desktop/Downloaded/Missier, Oliaro, Raffa - 2006 - Practical data quality certification model, architecture, and experiences.pdf:pdf},
keywords = {"Data Quality","Information Quality Management"},
publisher = {ACM},
title = {{Practical data quality certification: model, architecture, and experiences}},
year = {2006},
urlpaper = {http://www.mendeley.com/download/public/212462/858962792/1f186264788242cfea71ef0090f49ca20ba48935/dl.pdf}
}
Quality Views: Capturing and Exploiting the User Perspective on Data Quality.
Missier, P; Embury, S M; Greenwood, M; Preece, A D; and Jin, B
In Procs. VLDB, pages 977–988, Seoul, Korea, September 2006.
Paper
link
bibtex
@inproceedings{DBLP:conf/vldb/MissierEGPJ06,
address = {Seoul, Korea},
author = {Missier, P and Embury, S M and Greenwood, M and Preece, A D and Jin, B},
booktitle = {Procs. VLDB},
keywords = {"Information Quality Management","Workflow Management","e-Science",\#qurator},
mendeley-tags = {\#qurator},
month = sep,
pages = {977--988},
title = {{Quality Views: Capturing and Exploiting the User Perspective on Data Quality}},
url = {http://www.vldb.org/conf/2006/p977-missier.pdf},
year = {2006}
}
2005
(6)
Provider issues in quality-constrained data provisioning.
Missier, P; and Embury, S M
In IQIS 2005, International Workshop on Information Quality in Information Systems, 17 June 2005, Baltimore, Maryland, USA (SIGMOD 2005 Workshop), pages 5–15, 2005.
Paper
doi
link
bibtex
@inproceedings{DBLP:conf/iqis/MissierE05,
author = {Missier, P and Embury, S M},
booktitle = {IQIS 2005, International Workshop on Information Quality in Information Systems, 17 June 2005, Baltimore, Maryland, USA (SIGMOD 2005 Workshop)},
doi = {10.1145/1077501.1077507},
keywords = {Information Quality Management},
pages = {5--15},
title = {{Provider issues in quality-constrained data provisioning}},
url = {http://dx.doi.org/10.1145/1077501.1077507},
year = {2005}
}
Clustering Web pages based on their structure.
Crescenzi, V.; Merialdo, P.; and Missier, P.
Data Knowl. Eng., 54: 279–299. 2005.
Paper
doi
link
bibtex
1 download
@article{DBLP:journals/dke/CrescenziMM05,
author = {Crescenzi, Valter and Merialdo, Paolo and Missier, Paolo},
doi = {10.1016/j.datak.2004.11.004},
journal = {Data Knowl. Eng.},
pages = {279--299},
title = {{Clustering Web pages based on their structure.}},
url = {http://dx.doi.org/10.1016/j.datak.2004.11.004},
volume = {54},
year = {2005}
}
Improving Government-to-business relationships through data reconciliation and process re-engineering.
Bertoletti, M.; Missier, P.; Scannapieco, M.; Aimetti, P.; and Batini, C.
In Advances in Management Information System Monograph Series, chapter 5. April 2005.
link
bibtex
@incollection{bertoletti05:_improv_gover,
author = {Bertoletti, Marco and Missier, Paolo and Scannapieco, Monica and Aimetti, Pietro and Batini, Carlo},
booktitle = {Advances in Management Information System Monograph Series},
chapter = {5},
isbn = {0-7656-1133-3},
month = apr,
series = {Advances in Management Information System Monograph Series},
title = {{Improving Government-to-business relationships through data reconciliation and process re-engineering}},
year = {2005}
}
Data Quality at a Glance.
Scannapieco, M; Missier, P; and Batini, C
Datenbank-Spektrum, 14: 6–14. 2005.
Paper
link
bibtex
@article{DBLP:journals/dbsk/ScannapiecoMB05,
  author   = {Scannapieco, M and Missier, P and Batini, C},
  title    = {{Data Quality at a Glance.}},
  journal  = {Datenbank-Spektrum},
  volume   = {14},
  pages    = {6--14},
  year     = {2005},
  keywords = {Data Quality},
  url      = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.85.555\&rep=rep1\&type=pdf},
  annote   = {magazine}
}
2004
(4)
An Automatic Data Grabber for Large Web Sites.
Crescenzi, V.; Mecca, G.; Merialdo, P.; and Missier, P.
In Procs. VLDB, pages 1321–1324, 2004.
Paper
link
bibtex
@inproceedings{DBLP:conf/vldb/CrescenziMMM04,
  author    = {Crescenzi, Valter and Mecca, Giansalvatore and Merialdo, Paolo and Missier, Paolo},
  title     = {{An Automatic Data Grabber for Large Web Sites.}},
  booktitle = {Procs. VLDB},
  pages     = {1321--1324},
  year      = {2004},
  url       = {http://www.vldb.org/conf/2004/DEMP18.PDF}
}
QoS in Multichannel IS: The MAIS Approach.
Cappiello, C.; Missier, P.; Pernici, B.; Plebani, P.; and Batini, C.
In ICWE Workshops, pages 255–268, 2004.
Paper
link
bibtex
@inproceedings{DBLP:conf/icwe/CappielloMPPB04,
  author    = {Cappiello, Cinzia and Missier, Paolo and Pernici, Barbara and Plebani, Pierluigi and Batini, Carlo},
  title     = {{QoS in Multichannel IS: The MAIS Approach.}},
  booktitle = {ICWE Workshops},
  pages     = {255--268},
  year      = {2004},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.122.3805\&rep=rep1\&type=pdf}
}
Ontology-Based Question Answering in a Federation of University Sites: The MOSES Case Study.
Atzeni, P.; Basili, R.; Hansen, D H; Missier, P.; Paggio, P.; Pazienza, M. T.; and Zanzotto, F. M.
In Procs. NLDB, pages 413–420, 2004.
Paper
doi
link
bibtex
@inproceedings{DBLP:conf/nldb/AtzeniBHMPPZ04,
author = {Atzeni, Paolo and Basili, Roberto and Hansen, D H and Missier, Paolo and Paggio, Patrizia and Pazienza, Maria Teresa and Zanzotto, Fabio Massimo},
booktitle = {Procs. NLDB},
doi = {10.1007/b98754},
pages = {413--420},
title = {{Ontology-Based Question Answering in a Federation of University Sites: The MOSES Case Study.}},
url = {http://dx.doi.org/10.1007/b98754},
year = {2004}
}
2003
(6)
Fine-grain web site structure discovery.
Crescenzi, V.; Merialdo, P.; and Missier, P.
In Procs. WIDM, pages 15–22, 2003.
Paper
doi
link
bibtex
1 download
@inproceedings{DBLP:conf/widm/CrescenziMM03,
author = {Crescenzi, Valter and Merialdo, Paolo and Missier, Paolo},
booktitle = {Procs. WIDM},
doi = {10.1145/956699.956703},
pages = {15--22},
title = {{Fine-grain web site structure discovery.}},
url = {http://dx.doi.org/10.1145/956699.956703},
year = {2003}
}
}
The Service to Businesses Project: Improving Government-to-Business Relationships in Italy.
Bertoletti, M.; Missier, P.; Scannapieco, M.; Aimetti, P.; and Batini, C.
In Procs. EGOV, pages 468–471, 2003.
Paper
doi
link
bibtex
@inproceedings{DBLP:conf/egov/BertolettiMSAB03,
author = {Bertoletti, Marco and Missier, Paolo and Scannapieco, Monica and Aimetti, Pietro and Batini, Carlo},
booktitle = {Procs. EGOV},
doi = {10.1007/b11827},
pages = {468--471},
title = {{The Service to Businesses Project: Improving Government-to-Business Relationships in Italy.}},
url = {http://dx.doi.org/10.1007/b11827},
year = {2003}
}
Improving Data Quality in Practice: A Case Study in the Italian Public Administration.
Missier, P; Lalk, G; Verykios, V S; Grillo, F; Lorusso, T; and Angeletti, P
Distributed and Parallel Databases, 13: 135–160. 2003.
Paper
link
bibtex
@article{DBLP:journals/dpd/MissierLVGLA03,
  author  = {Missier, P and Lalk, G and Verykios, V S and Grillo, F and Lorusso, T and Angeletti, P},
  title   = {{Improving Data Quality in Practice: A Case Study in the Italian Public Administration.}},
  journal = {Distributed and Parallel Databases},
  volume  = {13},
  pages   = {135--160},
  year    = {2003},
  url     = {http://www.springerlink.com/content/x80m0245551j0202/}
}
2001
(2)
Eguru: a decision support system for the assisted design of e-commerce architectures.
Missier, P; Bianchi, M; Zordan, A; and Umar, A
In Knowledge Management & Intelligent Enterprises - Industrial Volume. Procs. 9th IFIP 2.6 Working Conference on Database Semantics (DS-9), Hong Kong, April 2001.
Paper
link
bibtex
@inproceedings{p.missier01:_eguru,
  author    = {Missier, P and Bianchi, M and Zordan, A and Umar, A},
  title     = {{Eguru: a decision support system for the assisted design of e-commerce architectures}},
  booktitle = {Knowledge Management \& Intelligent Enterprises - Industrial Volume. Procs. 9th IFIP 2.6 Working Conference on Database Semantics (DS-9)},
  address   = {Hong Kong},
  month     = apr,
  year      = {2001},
  url       = {http://books.google.co.uk/books?hl=en\&lr=\&id=K8Z8GvYrbEcC\&oi=fnd\&pg=PA69\&dq=related:d41Mrh54DIgJ:scholar.google.com/\&ots=zF1CSZ73jM\&sig=pl3HEwWOt7sQ6HFWqWiNkR5YD5w\#v=onepage\&q=\&f=false}
}
CitiTime: a system for rapid creation of portable next-generation telephony services.
Anjum, F.; Caruso, F.; Jain, R.; Missier, P.; and Zordan, A.
Computer Networks, 35: 579–595. 2001.
Paper
link
bibtex
@article{DBLP:journals/cn/AnjumCJMZ01,
author = {Anjum, Farooq and Caruso, Francesco and Jain, Ravi and Missier, Paolo and Zordan, Adalberto},
doi = {10.1016/S1389-1286(00)00195-X},
journal = {Computer Networks},
pages = {579--595},
title = {{CitiTime: a system for rapid creation of portable next-generation telephony services.}},
url = {http://dx.doi.org/10.1016/S1389-1286(00)00195-X},
volume = {35},
year = {2001}
}
2000
(3)
Java Call Control, Coordination and Transactions.
Jain, R; Anjum, F; Missier, P; and Shastry, S
IEEE Communications. January 2000.
Paper
link
bibtex
@article{jain00:,
  author  = {Jain, R and Anjum, F and Missier, P and Shastry, S},
  title   = {{Java Call Control, Coordination and Transactions}},
  journal = {IEEE Communications},
  month   = jan,
  year    = {2000},
  url     = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.16.8466\&rep=rep1\&type=pdf}
}
A Knowledge-Based Decision Support Workbench for Advanced Ecommerce.
Umar, A.; Bianchi, M.; Caruso, F.; and Missier, P.
In AIWoRC, pages 93–100, 2000.
Paper
link
bibtex
@inproceedings{DBLP:conf/aiworc/UmarBCM00,
  author    = {Umar, Amjad and Bianchi, Michelle and Caruso, Francesco and Missier, Paolo},
  title     = {{A Knowledge-Based Decision Support Workbench for Advanced Ecommerce.}},
  booktitle = {AIWoRC},
  pages     = {93--100},
  year      = {2000},
  url       = {http://csdl.computer.org/comp/proceedings/aiworc/2000/0628/00/06280093abs.htm}
}
1999
(4)
ChaiTime: A System for Rapid Creation of Portable Next-Generation Telephony Services Using Third-Party Software Components.
Anjum, F; Caruso, F; Jain, R; Missier, P; and Zordan, A
In Procs. Second IEEE Conference on Open Architectures and Network Programming (OpenArch), New York, March 1999.
Paper
doi
link
bibtex
@inproceedings{f.anjum99:,
address = {New York},
author = {Anjum, F and Caruso, F and Jain, R and Missier, P and Zordan, A},
booktitle = {Procs. Second IEEE Conference on Open Architectures and Network Programming (OpenArch)},
doi = {10.1109/OPNARC.1999.758431},
month = mar,
title = {{ChaiTime: A System for Rapid Creation of Portable Next-Generation Telephony Services Using Third-Party Software Components}},
url = {http://ieeexplore.ieee.org/xpl/freeabs\_all.jsp?arnumber=758431},
year = {1999}
}
A Knowledge-based Decision Support Workbench for Enterprise Resource Integration and Migration.
Umar, A; and Missier, P
In Procs. First International Workshop on Enterprise Management and Resource Planning Systems (EMRPS99), Venice, Italy, 1999.
link
bibtex
@inproceedings{umar99:,
  author    = {Umar, A and Missier, P},
  title     = {{A Knowledge-based Decision Support Workbench for Enterprise Resource Integration and Migration}},
  booktitle = {Procs. First International Workshop on Enterprise Management and Resource Planning Systems (EMRPS99)},
  address   = {Venice, Italy},
  year      = {1999}
}
A Framework for Analyzing Virtual Enterprise Infrastructure.
Umar, A.; and Missier, P.
In RIDE, pages 4–11, 1999.
Paper
link
bibtex
@inproceedings{DBLP:conf/ride/UmarM99,
  author    = {Umar, Amjad and Missier, Paolo},
  title     = {{A Framework for Analyzing Virtual Enterprise Infrastructure.}},
  booktitle = {RIDE},
  pages     = {4--11},
  year      = {1999},
  url       = {http://computer.org/conferen/proceed/ride/0119/01190004abs.htm}
}
1998
(2)
.
Missier, P; Rusinkiewicz, M; and Jin, W
Multidatabase Languages. Morgan Kaufmann, 1998.
Paper
link
bibtex
1 download
@incollection{p.missier98b:,
author = {Missier, P and Rusinkiewicz, M and Jin, W},
booktitle = {Management of Heterogeneous and Autonomous Database Systems},
publisher = {Morgan Kaufmann},
title = {{Multidatabase Languages}},
url = {http://books.google.co.uk/books?hl=en\&lr=\&id=BvuTyyMtGbAC\&oi=fnd\&pg=PA175\&dq=IFIP+Conference+Proceedings+1995+missier\&ots=ubXv\_IiwE0\&sig=fUxrDxeLlchFL9ckMRndXZRXH9E},
year = {1998}
}
.
Missier, P
Technology for the Copyright Protection of Digital Image. Scuola Normale Superiore di Pisa, Centro Ricerche Informatiche, 1998.
link
bibtex
@incollection{p.missier98:,
author = {Missier, P},
booktitle = {Monography Bullettin on archiving in Art History},
publisher = {Scuola Normale Superiore di Pisa, Centro Ricerche Informatiche},
title = {{Technology for the Copyright Protection of Digital Image}},
year = {1998}
}
1995
(2)
Providing Multidatabase Access - an Association Approach.
Missier, P; Rusinkiewicz, M; and Silberschatz, A
In Procs. 6th International Hong Kong Computer Society Database Workshop on Database Reengineering and Interoperability, Hong Kong, March 1995.
Paper
link
bibtex
@inproceedings{missier95:,
  author    = {Missier, P and Rusinkiewicz, M and Silberschatz, A},
  title     = {{Providing Multidatabase Access - an Association Approach}},
  booktitle = {Procs. 6th International Hong Kong Computer Society Database Workshop on Database Reengineering and Interoperability},
  address   = {Hong Kong},
  month     = mar,
  year      = {1995},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.4719}
}
Extending a Multidatabase Manipulation Language to Resolve Schema and Data Conflicts.
Missier, P.; and Rusinkiewicz, M.
In DS-6, pages 93–115, 1995.
Paper
link
bibtex
1 download
@inproceedings{DBLP:conf/ds/MissierR95,
  author    = {Missier, Paolo and Rusinkiewicz, Marek},
  title     = {{Extending a Multidatabase Manipulation Language to Resolve Schema and Data Conflicts.}},
  booktitle = {DS-6},
  pages     = {93--115},
  year      = {1995},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.1.5127\&rep=rep1\&type=pdf}
}
1992
(1)
Semantic unification in the inference of union types.
Missier, P
In Procs. GULP'92 (Logic Programming), Tremezzo, Como, Italy, 1992.
link
bibtex
@inproceedings{missier92:_seman,
  author    = {Missier, P},
  title     = {{Semantic unification in the inference of union types}},
  booktitle = {Procs. GULP'92 (Logic Programming)},
  address   = {Tremezzo, Como, Italy},
  year      = {1992}
}
|