rdf_optimise.pl

View source with raw comments or as raw

    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2010-2018, University of Amsterdam
    7                              VU University Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(rdf_optimise,
   37          [ rdf_optimise/1,             % :Query
   38            rdf_optimise/2,             % +Query, -Optimised
   39            rdf_optimise/4,             % +Query, -Optimised, -Space, -Time
   40            rdf_complexity/3,           % :Goal, -SpaceEstimate, -TimeEstimate
   41            serql_select_bind_null/2    % +Goal, -WithBind
   42          ]).   43:- use_module(library(semweb/rdf_db)).   44:- use_module(library(debug)).   45:- use_module(library(lists)).   46:- use_module(library(pairs)).   47:- use_module(library(ordsets)).   48:- use_module(library(ugraphs)).   49
   50:- meta_predicate
   51    rdf_optimise(0).   52
   53/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   54Queries  as  returned  by  serql_compile_path/2    consists  of  a  path
   55expression which is compiled to a conjunction  of calls to rdf/3 and the
   56translation of the WHERE clause acting as an additional filter.
   57
   58Optimisation of a query basically means moving conditions into the rdf/3
   59calls where possible, moving other conditions   as  early as possibly in
   60the graph-matching and reordering graph matching calls (rdf/3) to reduce
   61non-determinism.
   62
   63Reordering
   64----------
   65
   66Reordering of graph expressions  is   required  to  reduce backtracking.
   67Roughly I see three approaches:
   68
   69        * Learning
   70        Create permutations of the query and make them run under time
   71        constraints.  Try to learn patterns that work (fast).
   72
   73        * Use statistics
   74        Given the number of solutions on a certain partially instantiated
   75        rdf/3 call (and the required execution time), reorganise them to
   76        minimalise the cost.
   77
   78        * Use constraint solving
   79        Instead of trying to solve an rdf/3 call, create a constraint from
   80        it and only try to solve it if there is more information.  This
   81        is especially attractive if some form of high-level reasoning
   82        from the language entailment rules can be applied or set-theory
   83        is a possibility.
   84
   85After experiments with using constraint solving,   this  module now uses
   86planning based on statistical information provided by the rdf_db module.
   87This  algorithm  reaches  optimal  performance  under  quite  reasonable
   88assumptions  while  the  planning  overhead   is  very  reasonable.  The
   89algorithm is described in "An  optimised   Semantic  Web  query language
   90implementation        in        Prolog",          available         from
   91http://www.swi-prolog.org/download/publications/ICLP05-SeRQL.pdf
   92
   93NOTES:
   94
   95        * SeRQL LIKE works on resources *and* literals.  Do we want this?
   96          See http://rdf4j.org/forum/mvnforum/viewthread?thread=275
   97- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
   98
   99:- multifile
  100    user:goal_expansion/2.  101
  102user:goal_expansion(rdf_complexity(G0, C), rdf_complexity(G, C)) :-
  103    expand_goal(G0, G).
  104user:goal_expansion(rdf_optimise(G0, O), rdf_optimise(G, O)) :-
  105    expand_goal(G0, G).
  106user:goal_expansion(rdf_optimise(G0, O, C, E), rdf_optimise(G, O, C, E)) :-
  107    expand_goal(G0, G).
  108
  109
  110/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  111Plan (conjunctions)
  112
  113        * Generate permutations and costs.  Select cheapest
  114        * The above moves tests right after the RDF call filling
  115          its input argument.  As a last optimisation some of these
  116          searches may be integrated in the rdf/3 call to help using
  117          indexing of the RDF database.
  118
  119complexity/2 needs to update the order of clauses inside meta calls
  120(notably optional path expressions).
  121- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

rdf_optimise(:Goal) is nondet

Optimise Goal and execute the result. Semantically equivalent to call/1.

To be done: - Module handling is not correct.

  130rdf_optimise(Module:Goal) :-
  131    rdf_optimise(Goal, Optimised),
  132    call(Module:Optimised).

rdf_optimise(+Goal, -Optimized) is det: Goal is a Prolog control-structure with calls to the rdf_db.pl and SeRQL runtime predicates. Optimized is a semantically equivalent goal, obtained by re-ordering conjunctions in Goal and processing sub-queries that do not share variables independently.

  143rdf_optimise(Conj, Optimised) :-
  144    rdf_optimise(Conj, Optimised, _, _).
  145
  146rdf_optimise(Conj, Optimised, Space, Time) :-
  147    debug(rdf_optimise, '*** OPTIMIZING ***~n', []),
  148    dbg_portray_body(Conj),
  149    term_variables(Conj, Vars),
  150    rdf_complexity(Conj, Conj, S0, E0),
  151    State = state(Vars-Conj, S0, E0, 1),
  152    debug(rdf_optimise, 'C0 = ~w~n', [E0]),
  153    (   reorder(Conj, Perm),
  154        (   debugging(rdf_optimise(all))
  155        ->  dbg_portray_body(Perm)
  156        ;   true
  157        ),
  158        rdf_complexity(Perm, Perm1, S, C),
  159        debug(rdf_optimise(all), '--> space=~w, time=~w~n', [S, C]),
  160
  161        arg(4, State, N),
  162        N2 is N + 1,
  163        nb_setarg(4, State, N2),
  164
  165        (   arg(3, State, C0),
  166            C < C0
  167        ->  debug(rdf_optimise,
  168                  'BETTER ONE [~D]: --> space=~w, time=~w~n', [N, S, C]),
  169            dbg_portray_body(Perm1),
  170            nb_setarg(3, State, C),
  171            nb_setarg(2, State, S),
  172            nb_setarg(1, State, Vars-Perm1)
  173        ;   true
  174        ),
  175        fail
  176    ;   arg(1, State, Vars-Optimised),
  177        arg(2, State, Space),
  178        arg(3, State, Time),
  179        debug(rdf_optimise, '  --> optimised: s/t = ~w/~w --> ~w/~w~n',
  180              [S0, E0, Space, Time]),
  181        dbg_portray_body(Optimised)
  182    ),
  183    !.
  184optimise_order(Conj, Conj, -1, -1) :-
  185    debug(rdf_optimise, 'Failed to optimise:~n', []),
  186    dbg_portray_body(Conj).
  187
  188
  189                 /*******************************
  190                 *           REORDERING         *
  191                 *******************************/

reorder(Goal, Reordered): Assuming Goal is a conjunction, create permutations of its order. Instead of blindly generating permutations however, we get an arbitrary element of the conjunction to the front and compute subgraphs of goals connected through variables after executing the first goal.

  201reorder(Goal, Reordered) :-
  202    State = bindings([]),
  203    conj_to_list(Goal, Conj0),
  204    reorder_conj(Conj0, State, Conj1),
  205    list_to_conj(Conj1, Reordered),
  206    arg(1, State, Bindings),
  207    unbind(Bindings).
  208
  209reorder_conj([One], _, [One]) :- !.
  210reorder_conj(List, State, Perm) :-
  211    group_by_cut(List, Before, Cut, After),
  212    !,
  213    reorder_conj(Before, State, PermBefore),
  214    bind_args(Before, State),               % this part is done
  215    reorder_conj(After, State, PermAfter),
  216    append(PermBefore, [Cut|PermAfter], Perm).
  217reorder_conj(List, State, Perm) :-
  218    group_by_optional(List, Normal, Optional),
  219    !,
  220    reorder_conj(Normal, State, PermNormal),
  221    bind_args(Normal, State),               % this part is done
  222    reorder_conj(Optional, State, PermOptional),
  223    append(PermNormal, PermOptional, Perm).
  224reorder_conj(List, State, [goal(_,independent(SubPerms),_)]) :-
  225    make_subgraphs(List, SubGraphs),
  226    SubGraphs \= [_],
  227    !,
  228    reorder_subgraph_conjs(SubGraphs, State, SubPerms).
  229reorder_conj(List, State, [Prefix|Perm]) :-
  230    select(Prefix, List, Rest),
  231    bind_args(Prefix, State),
  232    make_subgraphs(Rest, SubGraphs),
  233    (   SubGraphs = [SubGraph]
  234    ->  reorder_conj2(SubGraph, State, Perm)
  235    ;   Perm = [goal(_,independent(SubPerms),_)],
  236        reorder_subgraph_conjs(SubGraphs, State, SubPerms)
  237    ).

reorder_subgraph_conjs(SubGraphs, -ReorderedSubGraphs): Reorder the individual subgraphs. As we know these are independent there is no need to order the subgraphs themselves. we also know they are fully connected, and no longer contain cuts, so we use the simplified version of reorder_conj/2 called reorder_conj2/2.

  248reorder_subgraph_conjs([], _, []).
  249reorder_subgraph_conjs([H0|T0], State, [H|T]) :-
  250    reorder_conj2(H0, State, H1),
  251    list_to_conj(H1, H),
  252    reorder_subgraph_conjs(T0, State, T).
  253
  254reorder_conj2([One], _, [One]) :- !.
  255reorder_conj2(List, State, [Prefix|Perm]) :-
  256    select(Prefix, List, Rest),
  257    bind_args(Prefix, State),
  258    make_subgraphs(Rest, SubGraphs),
  259    (   SubGraphs = [SubGraph]
  260    ->  reorder_conj2(SubGraph, State, Perm)
  261    ;   Perm = [goal(_,independent(SubPerms),_)],
  262        reorder_subgraph_conjs(SubGraphs, State, SubPerms)
  263    ).

group_by_cut(+List, -BeforeCut, -Cut, -AfterCut): Split the conjunction over a cut (!) as we cannot guarantee proper behaviour when moving goals to the other side of a cut.

  270group_by_cut(Goals, Before, Cut, After) :-
  271    Cut = goal(_, !, _),
  272    append(Before, [Cut|After], Goals),
  273    !.

group_by_optional(+List, -NotOptional, -Optional): Split the conjunction in optional and non-optional part as we always want the optional part to happen last.

  281group_by_optional(List, Normal, Optional) :-
  282    split_optional(List, Normal, Optional),
  283    Normal \== [],
  284    Optional \== [].
  285
  286split_optional([], [], []).
  287split_optional([H|T0], Normal, Optional) :-
  288    (   optional(H)
  289    ->  Optional = [H|T],
  290        split_optional(T0, Normal, T)
  291    ;   Normal = [H|T],
  292        split_optional(T0, T, Optional)
  293    ).
  294
  295optional(G) :-
  296    goal(G, (_*->_;_)).

bind_args(Goal, !State): Bind all arguments in Goal or list of goals. Assumes that executing a goal grounds all its arguments. Only the goal A = literal(B), generated by optimising where constraints is handled special.
State is a term bindings(List) that is destructively maintained by instantiate/4.

  308bind_args([], _) :- !.
  309bind_args([H|T], State) :-
  310    !,
  311    bind_args(H, State),
  312    bind_args(T, State).
  313bind_args(H, State) :-
  314    goal(H, A=literal(B)),
  315    var(A), var(B),
  316    !,
  317    (   instantiated(A, I),
  318        I \== (-)
  319    ->  instantiate(B, _, I, State)
  320    ;   instantiated(B, I),
  321        I \== (-)
  322    ->  instantiate(A, _, I, State)
  323    ;   true
  324    ).
  325bind_args(Goal, State) :-
  326    vars(Goal, Vars),
  327    ground_vars(Vars, State).
  328
  329ground_vars([], _).
  330ground_vars([H|T], State) :-
  331    instantiate(H, _, r, State),
  332    ground_vars(T, State).

make_subgraphs(+Goals, -SubGraphs)

Create a list of connected subgraphs from Goals, assuming the variables in the assoc Grounded have been bound.

Arguments:

Goals - is a list goal(Id, Goal, Vars).

  342make_subgraphs([Goal], [[Goal]]) :- !.
  343make_subgraphs([G1,G2], Graphs) :-
  344    !,
  345    unbound_vars(G1, V1),
  346    unbound_vars(G2, V2),
  347    (   ord_intersect(V1, V2)
  348    ->  Graphs = [[G1,G2]]
  349    ;   Graphs = [[G1],[G2]]
  350    ).
  351make_subgraphs(Goals, SubGraphs) :-
  352    map_list_to_pairs(unbound_vars, Goals, UnBoundKeyed),
  353    connected_pairs(UnBoundKeyed, Edges),
  354    vertices_edges_to_ugraph(Goals, Edges, UGraph),
  355    connected_vertices(UGraph, SubGraphs).
  356
  357connected_pairs([], []).
  358connected_pairs([H|T], Edges) :-
  359    connected_pairs(T, H, Edges, EdgeTail),
  360    connected_pairs(T, EdgeTail).
  361
  362connected_pairs([], _, Edges, Edges).
  363connected_pairs([H|T], To, Edges, EdgeTail) :-
  364    (   connected(H, To, GH, GT)
  365    ->  Edges = [GH-GT,GT-GH|Edges1],
  366        connected_pairs(T, To, Edges1, EdgeTail)
  367    ;   connected_pairs(T, To, Edges, EdgeTail)
  368    ).
  369
  370connected(V1-G1, V2-G2, G1, G2) :-
  371    ord_intersect(V1, V2).
  372
  373connected_vertices([], []) :- !.
  374connected_vertices(UGraph, [Set1|Sets]) :-
  375    UGraph = [V1-_|_],
  376    reachable(V1, UGraph, Set1),
  377    del_vertices(UGraph, Set1, UGraph2),
  378    connected_vertices(UGraph2, Sets).

unbound_vars(+Goal, -Vars) is det: True when Vars is an ordered set of unbound variables in Goal.

  385unbound_vars(Goal, Vars) :-
  386    vars(Goal, AllVars),
  387    unbound(AllVars, Vars0),
  388    sort(Vars0, Vars).
  389
  390unbound([], []).
  391unbound([H|T0], [H|T]) :-
  392    instantiated(H, -),
  393    !,
  394    unbound(T0, T).
  395unbound([_|T0], T) :-
  396    unbound(T0, T).

conj_to_list(+Conj, -List)

Translate a conjunction into a list of elements of the format

! goal(Id, Goal, Vars)

Where Id is a goal identifier, Goal is the goal itself and Vars is a list of variables inside the Goal. Variables are sorted to the standard order of terms.

  408conj_to_list(Conj, List) :-
  409    phrase(conj_to_list(Conj, 1, _), List).
  410
  411conj_to_list((A,B), N0, N) -->
  412    !,
  413    conj_to_list(A, N0, N1),
  414    conj_to_list(B, N1, N).
  415conj_to_list(true, N, N) -->
  416    !,
  417    [].
  418conj_to_list(G, N0, N) -->
  419    { term_variables(G, Vars0),
  420      sort(Vars0, Vars),
  421      N is N0 + 1
  422    },
  423    [ goal(N0, G, Vars)
  424    ].

list_to_conj(+List, -Conj)

  430list_to_conj([], true).
  431list_to_conj([goal(_,G,_)], G) :- !.
  432list_to_conj([goal(_,G,_)|T0], (G,T)) :-
  433    list_to_conj(T0, T).

id(G, Id) is det
goal(G, Goal) is det
vars(G, Vars) is det: Extract fields from the goal structure.

  442%id(goal(Id, _, _),     Id).
  443goal(goal(_, Goal, _), Goal).
  444vars(goal(_, _, Vars), Vars).
  445
  446
  447                 /*******************************
  448                 *             BINDING          *
  449                 *******************************/
  450
  451/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  452Keep track of binding status of a   variable.  There are basically three
  453ways. One is to use a seperate   (assoc) table. Alternatively we can use
  454attributed variables and delete the attributes   at the end, and finally
  455we can use normal variables and recreate  the original goal by unbinding
  456them again.
  457
  458Using assocs requires us to pass  these   things  around  and involves a
  459log(N) complexity lookup. Using real terms  has the disadvantage that to
  460unbind we have to copy the term, thus  loosing bindings it may have with
  461the environment.  Using attributes suffers neither of these problems and
  462its only drawback is relying on non-standard Prolog features.
  463
  464Note that the remainder of the  algorithm   uses  sets  organised to the
  465standard order of terms.  As  putting   attributes  does  not change the
  466identity of global stack variables and goals are global stack terms this
  467is guaranteed.
  468- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

instantiate_obj(+Arg, -Old, +New, !Bindings) is det: Called to change the state of an RDF object after running some goal. Old is the current instantiation. After executing the RDF Goal the new instantiation is New (typically b). Bindings is a term bindings(List), which is updated using destructive assignment. List is the list of all variables to which we added attributes.

  479instantiate_obj(Arg, Old, New, Bindings) :-
  480    (   var(Arg)
  481    ;   ground(Arg)
  482    ),
  483    !,
  484    instantiate(Arg, Old, New, Bindings).
  485instantiate_obj(literal(Pattern, Var), +(Pattern), New, Bindings) :-
  486    instantiate(Var, _, New, Bindings).
  487
  488instantiate(Var, Old, New, Bindings) :-
  489    instantiated(Var, Old),
  490    (   Old == (-)
  491    ->  put_attr(Var, instantiated, New),
  492        arg(1, Bindings, B0),
  493        setarg(1, Bindings, [Var|B0])
  494    ;   true
  495    ).
  496
  497instantiated(Term, How) :-
  498    (   nonvar(Term)
  499    ->  How = +(+)
  500    ;   get_attr(Term, instantiated, H)
  501    ->  How = +(H)
  502    ;   How = -
  503    ).
  504
  505uninstantiate(Term, How) :-
  506    (   get_attr(Term, instantiated, How)
  507    ->  del_attr(Term, instantiated)
  508    ;   true
  509    ).

instantiate_unify(A, B, State) is det: We encounter a plain unification. Ideally, we determine the most specific binding and propagate this and execute the unification. The latter however must be undone if we evaluate a disjunction. For now, the code is somewhat simplified and we merely decide that if one side is instantiated, the other will be too, disregarding the actual value we might know.

  520instantiate_unify(A, B, State) :-
  521    instantiated(B, +(_)),
  522    !,
  523    instantiate(A, _, b, State).
  524instantiate_unify(A, B, State) :-
  525    instantiated(A, +(_)),
  526    !,
  527    instantiate(B, _, b, State).
  528instantiate_unify(_, _, _).

attr_unify_hook(+Attribute, +Value): For now, the attribute unify hook only allows unifying with a variable with the same attribute. This deals with the unification that takes place in rdf_optimise/3 for the variables of the saved copy.

  538instantiated:attr_unify_hook(Attr, Value) :-
  539    get_attr(Value, instantiated, Attr).
  540
  541instantiated:attr_portray_hook(Value, _Var) :-
  542    write(+(Value)).

instantiate_term(+Term): Instantiate all unbound variables

  548instantiate_term(Term, How) :-
  549    compound(Term),
  550    !,
  551    functor(Term, _, Arity),
  552    instantiate_args(Arity, Term, How).
  553instantiate_term(Term, How) :-
  554    (   var(Term),
  555        \+ get_attr(Term, instantiated, _)
  556    ->  put_attr(Term, instantiated, How)
  557    ;   true
  558    ).
  559
  560instantiate_args(0, _, _) :- !.
  561instantiate_args(N, Term, How) :-
  562    arg(N, Term, A),
  563    instantiate_term(A, How),
  564    N2 is N - 1,
  565    instantiate_args(N2, Term, How).

uninstantiate_term(+Term, +How): Remove all skolem instantiations from Term.

  572uninstantiate_term(Term, How) :-
  573    compound(Term),
  574    !,
  575    functor(Term, _, Arity),
  576    uninstantiate_args(Arity, Term, How).
  577uninstantiate_term(Term, How) :-
  578    uninstantiate(Term, How).
  579
  580uninstantiate_args(0, _, _) :- !.
  581uninstantiate_args(N, Term, How) :-
  582    arg(N, Term, A),
  583    uninstantiate_term(A, How),
  584    N2 is N - 1,
  585    uninstantiate_args(N2, Term, How).

delete_instantiated(+List, -Unbound): Delete all elements of List that are not instantiated (i.e. var and without an instantiated attribute).

  593delete_instantiated([], []).
  594delete_instantiated([H|T0], L) :-
  595    (   instantiated(H, -)
  596    ->  L = [H|T],
  597        delete_instantiated(T0, T)
  598    ;   delete_instantiated(T0, L)
  599    ).
  600
  601
  602                 /*******************************
  603                 *          COMPLEXITY          *
  604                 *******************************/

rdf_complexity(+GoalIn, -GoalOut, -SpaceEstimate, -TimeEstimate)

Provide an estimate for the size of the search space for executing Goal. For this we estimate the branching factor of each subgoal in the conjunction. If the branching factors are B0, B1, ... then the total complexity estimate is

E = 1 + B0 + B0*B1 + B0*B1*B2, ...

Non-RDF calls are supposed to be boolean tests that can be executed at the first opportunity all arguments are bound by RDF calls. They have a probability of failure, reducing the search space. Using the above formula, any number lower than 1 moves the test as far as possible to the head of the conjunction.

If GoalIn and GoalOut are the same the system will not try to optimize local conjunctions.

ISSUES: control structures ;, if-then-else, etc.

  626rdf_complexity(Goal, SpaceEstimate, TimeEstimate) :-
  627    rdf_complexity(Goal, Goal, SpaceEstimate, TimeEstimate).
  628
  629rdf_complexity(Goal0, Goal, Space, Time) :-
  630    State = bindings([]),
  631    complexity(Goal0, Goal, State, 1, Space, 0, Time),
  632    arg(1, State, Bindings),
  633    unbind(Bindings).
  634
  635unbind([]).
  636unbind([H|T]) :-
  637    del_attr(H, instantiated),
  638    unbind(T).
  639
  640%       complexity(:GoalIn, -GoalOut,
  641%                  +State,
  642%                  +SpaceIn, -SpaceOut,
  643%                  +CountIn, -CountOut)
  644%
  645%       Compute the complexity of Goal.  Vars   is  an assoc holding the
  646%       variables bound earlier in the conjunction. Space keeps the size
  647%       of the search space and Count is   the  cummulative count of the
  648%       costs for exploring the search space  espressed in the number of
  649%       nodes that will be visited.
  650%
  651%       The (G*->_=true;_=false) clause deals with   the  code generated
  652%       from optional graph specs as provided by SeRQL.
  653
  654complexity((A0,B0), (A,B), State, Sz0, Sz, C0, C) :-
  655    !,
  656    complexity(A0, A, State, Sz0, Sz1, C0, C1),
  657    complexity(B0, B, State, Sz1, Sz, C1, C).
  658complexity((G0*->True;False),
  659           ( G*->True;False), State, Sz0, Sz, C0, C) :-
  660    !,
  661    (   var(G)
  662    ->  optimise_order(G0, G, Sz1, C1),
  663        Sz is Sz0 * Sz1,
  664        C is C0+Sz0*C1
  665    ;   complexity(G, G, State, Sz0, Sz, C0, C)
  666    ).
  667complexity((If0->Then0;Else0),          % dubious
  668           ( If->Then; Else), State, Sz0, Sz, C0, C) :-
  669    !,
  670    (   var(If)
  671    ->  optimise_order(If0,   If,   Sz1, C1),
  672        optimise_order(Then0, Then, Sz2, C2),
  673        optimise_order(Else0, Else, Sz3, C3),
  674        Sz is max(Sz0 * Sz1 * Sz2, Sz0 * Sz3),
  675        C  is C0 + max(Sz0*C1+Sz0*Sz1*C2, Sz0*C1+Sz0*Sz1*C3)
  676    ;   complexity(If,   If,   State, Sz0, Sz1, C0, C1),
  677        complexity(Then, Then, State, Sz1, Sz2, C1, C2),
  678        complexity(Else, Else, State, Sz0, Sz3, C0, C3),
  679        Sz is max(Sz2, Sz3),
  680        C  is max(C2, C3)
  681    ).
  682complexity((A0;B0), (A;B), State, Sz0, Sz, C0, C) :-
  683    !,
  684    (   var(A)
  685    ->  optimise_order(A0, A, _, _),        % First try the cheap one?
  686        optimise_order(B0, B, _, _)
  687    ;   A = A0,
  688        B = B0
  689    ),
  690    complexity(A, A, State, Sz0, SzA, C0, CA),
  691    complexity(B, B, State, Sz0, SzB, C0, CB),
  692    Sz is SzA + SzB,
  693    C is CA + CB.
  694complexity(sparql_group(G0), sparql_group(G), State, Sz0, Sz, C0, C) :-
  695    !,
  696    (   var(G)
  697    ->  rdf_optimise(G0, G, Sz1, C1),
  698        Sz is Sz0 * Sz1,
  699        C is C0+Sz0*C1
  700    ;   complexity(G, G, State, Sz0, Sz, C0, C)
  701    ).
  702complexity(sparql_group(G0, OV, IV),
  703           sparql_group(G, OV, IV),
  704           State, Sz0, Sz, C0, C) :-
  705    !,
  706    (   var(G)
  707    ->  rdf_optimise(G0, G, Sz1, C1),
  708        Sz is Sz0 * Sz1,
  709        C is C0+Sz0*C1
  710    ;   complexity(G, G, State, Sz0, Sz, C0, C)
  711    ).
  712complexity(rdfql_carthesian(Bags),
  713           rdfql_carthesian(Bags), State, Sz0, Sz, C0, C) :-
  714    !,
  715    carth_complexity(Bags, State, Sz0, Sz, C0, 0, C).
  716complexity(independent(Goals0), Goal, State, Sz0, Sz, C0, C) :-
  717    !,
  718    independent_complexity(Goals0, Goal, State, Sz0, Sz, C0, 0, C).
  719complexity(Goal, Goal, State, Sz0, Sz, C0, C) :-
  720    Goal = member(Var, List),      % List is list of resources
  721    !,
  722    instantiate(Var, _, b, State),
  723    length(List, Branch),
  724    Sz is Sz0 * Branch,
  725    C is C0 + Sz0*0.2 + Sz*0.2.
  726complexity(Goal, Goal, State, Sz, Sz, C0, C) :-
  727    Goal = (A=B),
  728    !,
  729    instantiate_unify(A, B, State),
  730    C is C0 + 0.2.
  731complexity(Goal, Goal, State, Sz, Sz, C0, C) :-
  732    Goal = (Var=literal(V)),
  733    !,
  734    instantiated(V, +(_)),
  735    instantiate(Var, _, b, State),
  736    C is C0 + 0.2.
  737complexity(Goal, Goal, State, Sz0, Sz, C0, C) :-
  738    rdf_db_goal(Goal, S, P, O),
  739    !,
  740    instantiate(S, SI, b, State),
  741    instantiate(P, PI, b, State),
  742    instantiate_obj(O, OI, b, State),
  743    complexity0(SI, PI, OI, P, Goal, SetUp, PerAlt, Branch),
  744    Sz is Sz0 * Branch,
  745    C is C0 + Sz0*SetUp + Sz*PerAlt,
  746    debug(rdf_optimise(complexity), 'Complexity ~p: (~w) ~w --> ~w',
  747          [i(SI,PI,OI), Goal, Branch, C]).
  748complexity(sparql_eval(E,V), sparql_eval(E,V), _, Sz0, Sz, C0, C) :-
  749    !,
  750    term_variables(E, Vars),
  751    all_bound(Vars),
  752    Sz is Sz0,                      % probability of failure
  753    C is C0 + Sz*20.                % Sz * CostOfEval
  754complexity(sparql_true(E), sparql_true(E), _, Sz0, Sz, C0, C) :-
  755    !,
  756    term_variables(E, Vars),
  757    all_bound(Vars),
  758    Sz is Sz0 * 0.5,                % probability of failure
  759    C is C0 + Sz0*20.               % Sz * CostOfEval
  760complexity(G, G, _, Sz0, Sz, C0, C) :-  % non-rdf tests
  761    term_variables(G, Vars),
  762    all_bound(Vars),
  763    Sz is Sz0 * 0.5,                % probability of failure
  764    C is C0 + Sz0.                  % Sz * CostOfTest
  765
  766all_bound([]).
  767all_bound([H|T]) :-
  768    instantiated(H, +(_)),
  769    all_bound(T).

carth_complexity(+Bags, +State, +Size0, -Size, +Time0, +TimeSum0, -TimeSum) is det: Compute the time and space efficiency of the carthesian product. the total cost in time is the sum of the costs of all branches, The search space at the end is still the same product.

  779carth_complexity([], _, Sz, Sz, _, C, C).
  780carth_complexity([bag(_,G)|T], State,
  781                 Sz0, Sz,
  782                 C0, Csum0, Csumz) :-
  783    complexity(G, G, State, Sz0, Sz1, C0, C1),
  784    Csum1 is Csum0 + C1,
  785    carth_complexity(T, State, Sz1, Sz, C0, Csum1, Csumz).

independent_complexity(+GoalsIn, -Goals, +State, +Size0, -Size, +Time0, +TimeSum0, -TimeSum) is det

Compute the complexity of an independent conjunction.

Arguments:

Goals - is a list of g(Goal, Branch, Cost), where Branch is the number of solutions expected fromGoal and Cost is the estimated CPU time.

  798independent_complexity(GoalsIn, Goal, State,
  799                       Size0, Size,
  800                       Time0, TimeSum0, TimeSum) :-
  801    indep_complexity(GoalsIn, Goals1, State,
  802                     Size0, Size,
  803                     Time0, TimeSum0, TimeSum),
  804    keysort(Goals1, ByCost),
  805    pairs_values(ByCost, Goals2),
  806    simplify_carthesian(Goals2, Goal).
  807
  808indep_complexity([], [], _, Sz, Sz, _, C, C).
  809indep_complexity([G0|GT0], [SzG-bag(Vars,G,SzG,CG)|GT], State,
  810                       Sz0, Sz,
  811                       C0, Csum0, Csumz) :-
  812    complexity(G0, G, State, Sz0, Sz1, C0, C1),
  813    term_variables(G, VList),
  814    Vars =.. [v|VList],
  815    (   Sz0 =:= 0
  816    ->  SzG = 0,
  817        CG = 0
  818    ;   SzG is Sz1/Sz0,
  819        CG is (C1-C0)/Sz0
  820    ),
  821    Csum1 is Csum0 + C1,
  822    indep_complexity(GT0, GT, State, Sz1, Sz, C0, Csum1, Csumz).

simplify_carthesian(+Bags, -Goal) is det: Peer of the leading little branching goals from the carthesian handler. If there is a bag of one left, turn it into a normal goal.

  830simplify_carthesian([], true).
  831simplify_carthesian([bag(_,Goal,_Branch,_Cost)], Goal) :- !.
  832simplify_carthesian([bag(_,Goal,Branch,_Cost)|Bags], Final) :-
  833    (   Branch < 1.5
  834    ->  !, Final = (Goal, Final1),
  835        simplify_carthesian(Bags, Final1)
  836    ;   Bags == []
  837    ->  !, Final = Goal
  838    ).
  839simplify_carthesian(Bags, rdfql_carthesian(Bags)).

complexity0(+SI, +PI, +OI, +P, +G, -Setup, -PerAlt, -Branch)

SI,PI,OI describe the instantiation pattern. P is the predicate and G is the actual goal. Complexity is unified to an estimate of the size of the search space and therefore an estimate of the execution time required to prove the goal.

Literal `like' matches come out as +(like(Pattern)). We must estimate the percentage of the literals that match this pattern. Suppose the factor is 1,000. This means the branching is reduced by 1,000, but finding each solution is slow as it requires a linear scan. It is faster than going all the way back to Prolog backtracking however, so we estimate a factor 10 (TBD: verify that number).

ISSUES: rdf_has/3 vs rdf_reachable/3.

  859complexity0(+(_),+(_),+(_), _, _, 1, 0, 1) :- !.
  860complexity0(+(b),+(+),-, P, G, 1, 1, B) :-
  861    !,
  862    subj_branch_factor(G, B, Prop),
  863    rdf_predicate_property(P, Prop).
  864complexity0(-,+(+),+(b), P, G, 1, 1, B) :-
  865    !,
  866    obj_branch_factor(G, B, Prop),
  867    rdf_predicate_property(P, Prop).
  868complexity0(+(b), -, -, _, _, 1, 1, B) :-
  869    !,
  870    rdf_statistics(triples(Total)),
  871    subject_count(Subjs),
  872    (   Total == 0
  873    ->  B = 0
  874    ;   B is Total/Subjs
  875    ).
  876complexity0(_,_,+(like(Pat)),_, G, Factor, Factor, B) :-
  877    !,
  878    rdf_estimate_complexity(G, B0),
  879    pattern_filter(Pat, Factor0),
  880    Factor is max(1, min(B0, Factor0)/10),
  881    B is B0/Factor.
  882complexity0(_,_,_, _, G, 1, 1, B) :-
  883    rdf_estimate_complexity(G, B).
  884
  885:- if(rdf_statistics(subjects(_))).  886subject_count(Count) :-                         % RDF-DB 2.x
  887    rdf_statistics(subjects(Count)).
  888:- else.  889subject_count(Count) :-                         % RDF-DB 3.x
  890    rdf_statistics(resources(Count)).
  891:- endif.  892
  893:- multifile
  894    subj_branch_factor/3,
  895    obj_branch_factor/3.  896
  897subj_branch_factor(rdf(_,_,_),           X, rdf_subject_branch_factor(X)).
  898subj_branch_factor(rdf_has(_,_,_),       X, rdfs_subject_branch_factor(X)).
  899subj_branch_factor(rdf_reachable(_,_,_), X, rdfs_subject_branch_factor(X)).
  900
  901obj_branch_factor(rdf(_,_,_),            X, rdf_object_branch_factor(X)).
  902obj_branch_factor(rdf_has(_,_,_),        X, rdfs_object_branch_factor(X)).
  903obj_branch_factor(rdf_reachable(_,_,_),  X, rdfs_object_branch_factor(X)).

rdf_db_goal(+Goal, -Subject, -Predicate, -Object)

True if Goal is a pure (logical) predicate on the RDF database involving the given Subject, Predicate and Object. Defined multifile, allowing the optimiser to understand user-defined rdf/3 like predicates.

To be done: - Allow specifying different costs and branching factors

  915:- multifile
  916    rdf_db_goal/4.  917
  918rdf_db_goal(rdf(S,P,O),                 S,P,O).
  919rdf_db_goal(rdf_has(S,P,O),             S,P,O).
  920rdf_db_goal(rdf_reachable(S,P,O),       S,P,O).
  921rdf_db_goal(rdf(S,P,O, _DB),            S,P,O). % TBD: less hits

pattern_filter(+Like, -Factor): Estimate the efficiency of a pattern. This is a bit hard, as characters are not independent.

  928pattern_filter(Like, Factor) :-
  929    atom_codes(Like, Codes),
  930    pattern_factor(Codes, 1, Factor).
  931
  932pattern_factor([], F, F).
  933pattern_factor([0'*|T], F0, F) :-
  934    !,
  935    pattern_factor(T, F0, F).
  936pattern_factor([_|T], F0, F) :-
  937    F1 is F0*10,
  938    pattern_factor(T, F1, F).

rdf_estimate_complexity(+Goal, -Complexity): Estimate the branching factor introduced by Goal. This uses the rdf_db statistics of the hash chains which are based on exploiting the RDFS subPropertyOf reasoning.
In addition, rdf_reachable/3 introduces its own complexity which must be estimate using the branching factor of the relation.

  950rdf_estimate_complexity(G, C) :-
  951    rdf_db_goal(G, S, P0, O),
  952    map_predicate(P0, P),
  953    rdf_estimate_complexity(S, P, O, C).
  954
  955map(map_predicate(_,_)).
  956map(map_predicate(_,_):-_).
  957
  958term_expansion(In, Out) :-
  959    map(In),
  960    !,
  961    rdf_global_term(In, Out).
  962
  963map_predicate(X, X) :-
  964    var(X),
  965    !.
  966map_predicate(serql:directSubClassOf,    rdfs:subClassOf) :- !.
  967map_predicate(serql:directType,          rdf:type) :- !.
  968map_predicate(serql:directSubPropertyOf, rdfs:subPropertyOf) :- !.
  969map_predicate(X, X).
  970
  971
  972                 /*******************************
  973                 *     INSTANTIATE OPTIONAL     *
  974                 *******************************/
  975
  976/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  977In  SELECT  queries,  optional  parts  of   the  path  expression  leave
  978uninstantiated variables. These must be bound to  '$null$' to be able to
  979do correct merging for  DISTINCT.  The  naive   way  to  do  this  is to
  980instantiate all variables at the end  of   the  query.  On large selects
  981(i.e. involving many variables) this appears to be quite costly.  Doing
  982the job early, as in
  983
  984        (   Goal
  985        *-> true
  986        ;   bind_null(VarsInGoal)
  987        )
  988
  989is not correct as well, as VarsInGoal may  be involved in other parts of
  990the code either before or after the optional path expression. So we need
  991to:
  992
  993        * Do abtract execution and find the bindings done before arriving
  994          at Goal.
  995        * continue the execution, and watch for new bindings to these
  996          variables.  If we find a binding for the second time, remove
  997          it from the first and make a conditional binding for it.
  998
  999If we bind an argument unconditionally, we place an attribute 'b'. If it
 1000is conditionally bound, we place an attribute   c(Set), where Set is the
 1001set in which it was bound or plain  'c' if it was conditionally bound in
 1002multiple places.
 1003
 1004TBD:    disjunctions and other control structures.
 1005        queries that do not bind (such as SPARQL bound(X))
 1006- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
 1007
 1008serql_select_bind_null(Goal0, Goal) :-
 1009    State = bindings([]),
 1010    select_bind_null(Goal0, Goal1, State),
 1011    arg(1, State, Bindings),
 1012    c_unbind(Bindings, Left),
 1013    (   Left == []
 1014    ->  Goal = Goal1
 1015    ;   Goal = (Goal1, rdfql_cond_bind_null(Left))
 1016    ).
 1017
 1018c_unbind([], []).
 1019c_unbind([H|T0], L) :-
 1020    (   get_attr(H, instantiated, c)
 1021    ->  L = [H|T]
 1022    ;   L = T
 1023    ),
 1024    del_attr(H, instantiated),
 1025    c_unbind(T0, T).
 1026
 1027
 1028select_bind_null((A0,B0), (A,B), State) :-
 1029    !,
 1030    select_bind_null(A0, A, State),
 1031    select_bind_null(B0, B, State).
 1032select_bind_null((G0 *-> true; true),
 1033                 ( G *-> true; Bind),
 1034                 State) :-
 1035    !,
 1036    arg(1, State, B0),
 1037    select_bind_null(G0, G, State),
 1038    arg(1, State, B),
 1039    Bind = rdfql_bind_null(Vars),
 1040    c_bindings(B, B0, c(Bind), Vars).
 1041select_bind_null(rdfql_carthesian(List0),
 1042                 rdfql_carthesian(List), State) :-
 1043    !,
 1044    select_carth_bind_null(List0, List, State).
 1045select_bind_null(sparql_group(A0), sparql_group(A), State) :-
 1046    !,
 1047    select_bind_null(A0, A, State).
 1048select_bind_null(sparql_group(A0,Outer,Inner),
 1049                 sparql_group(A, Outer,Inner),
 1050                 State) :-
 1051    !,
 1052    select_bind_null(A0, A, State),
 1053    term_variables(Outer, Vars),
 1054    c_bind(Vars, State).
 1055select_bind_null(Goal, Goal, State) :-
 1056    term_variables(Goal, Vars),
 1057    c_bind(Vars, State).

c_bindings(+AtEnd, +AtStart, +CVars, -Vars)

The variables of the difference-list AtEnd..AtStart are conditionally bound. Tag each such variable with CVars.

Arguments:

CVars - Term c(Vars), where Vars are the other variables that have the same conditional binding.

 1067c_bindings(B, B0, _, Vars)  :-
 1068    B == B0,
 1069    !,
 1070    Vars = [].
 1071c_bindings([H|T0], B0, Attr, [H|Vars]) :-
 1072    get_attr(H, instantiated, I),
 1073    is_instantiated(I),
 1074    !,
 1075    put_attr(H, instantiated, Attr),
 1076    c_bindings(T0, B0, Attr, Vars).
 1077c_bindings([_|T0], B0, Attr, Vars) :-
 1078    c_bindings(T0, B0, Attr, Vars).
 1079
 1080
 1081is_instantiated(b).             % unconditionally bound
 1082is_instantiated(c(_)).          % bound either by call or rdfql_bind_null/1

c_bind(+Vars, +State): Do unconditional binding of Vars. Var may be in a rdfql_bind_null/1 set. In that case, delete it from the set, and set the class to 'c' to make a conditional binding at the end.

 1091c_bind([], _).
 1092c_bind([H|T], State) :-
 1093    (   get_attr(H, instantiated, I)
 1094    ->  (   I == b                  % already unconditionally bound
 1095        ->  true
 1096        ;   I = c(Set)
 1097        ->  arg(1, Set, Vars0),
 1098            del_var(H, Vars0, Vars),
 1099            setarg(1, Set, Vars),
 1100            put_attr(H, instantiated, c)
 1101        ;   I == c
 1102        ->  true
 1103        )
 1104    ;   put_attr(H, instantiated, b),
 1105        arg(1, State, B0),
 1106        setarg(1, State, [H|B0])
 1107    ),
 1108    c_bind(T, State).
 1109
 1110del_var(H, [X|T0], T) :-
 1111    (   H == X
 1112    ->  T = T0
 1113    ;   T = [X|T1],
 1114        del_var(H, T0, T1)
 1115    ).
 1116
 1117select_carth_bind_null([], [], _).
 1118select_carth_bind_null([bag(Vars, G0)|T0], [bag(Vars, G)|T], State) :-
 1119    select_bind_null(G0, G, State),
 1120    select_carth_bind_null(T0, T, State).
 1121
 1122
 1123                 /*******************************
 1124                 *        DEBUG SUPPORT         *
 1125                 *******************************/
 1126
 1127dbg_portray_body(G) :-
 1128    debugging(rdf_optimise),
 1129    !,
 1130    portray_body(G).
 1131dbg_portray_body(_).
 1132
 1133portray_body(G) :-
 1134    (   pp_instantiate_term(G),
 1135        debug(_, '~@',
 1136              [ portray_clause(current_output, (<> :- G), [module(sparql_runtime)])
 1137              ]),
 1138        fail
 1139    ;   true
 1140    ).
 1141
 1142pp_instantiate_term(Term) :-
 1143    compound(Term),
 1144    !,
 1145    functor(Term, _, Arity),
 1146    pp_instantiate_args(Arity, Term).
 1147pp_instantiate_term(Term) :-
 1148    var(Term),
 1149    get_attr(Term, instantiated, H),
 1150    !,
 1151    del_attr(Term, instantiated),
 1152    Term = +(H).
 1153pp_instantiate_term(_).
 1154
 1155pp_instantiate_args(0, _) :- !.
 1156pp_instantiate_args(N, Term) :-
 1157    arg(N, Term, A),
 1158    pp_instantiate_term(A),
 1159    N2 is N - 1,
 1160    pp_instantiate_args(N2, Term).
 1161
 1162
 1163                 /*******************************
 1164                 *            SANDBOX           *
 1165                 *******************************/
 1166
 1167:- multifile sandbox:safe_meta_predicate/1. 1168
 1169sandbox:safe_meta_predicate(rdf_optimise:rdf_optimise/1)