1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2010-2018, University of Amsterdam 7 VU University Amsterdam 8 All rights reserved. 9 10 Redistribution and use in source and binary forms, with or without 11 modification, are permitted provided that the following conditions 12 are met: 13 14 1. Redistributions of source code must retain the above copyright 15 notice, this list of conditions and the following disclaimer. 16 17 2. Redistributions in binary form must reproduce the above copyright 18 notice, this list of conditions and the following disclaimer in 19 the documentation and/or other materials provided with the 20 distribution. 21 22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 30 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 32 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 33 POSSIBILITY OF SUCH DAMAGE. 34*/ 35 36:- module(rdf_optimise, 37 [ rdf_optimise/1, % :Query 38 rdf_optimise/2, % +Query, -Optimised 39 rdf_optimise/4, % +Query, -Optimised, -Space, -Time 40 rdf_complexity/3, % :Goal, -SpaceEstimate, -TimeEstimate 41 serql_select_bind_null/2 % +Goal, -WithBind 42 ]). 43:- use_module(library(semweb/rdf_db)). 44:- use_module(library(debug)). 45:- use_module(library(lists)). 46:- use_module(library(pairs)). 47:- use_module(library(ordsets)). 48:- use_module(library(ugraphs)). 49 50:- meta_predicate 51 rdf_optimise( ). 52 53/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 54Queries as returned by serql_compile_path/2 consists of a path 55expression which is compiled to a conjunction of calls to rdf/3 and the 56translation of the WHERE clause acting as an additional filter. 57 58Optimisation of a query basically means moving conditions into the rdf/3 59calls where possible, moving other conditions as early as possibly in 60the graph-matching and reordering graph matching calls (rdf/3) to reduce 61non-determinism. 62 63Reordering 64---------- 65 66Reordering of graph expressions is required to reduce backtracking. 67Roughly I see three approaches: 68 69 * Learning 70 Create permutations of the query and make them run under time 71 constraints. Try to learn patterns that work (fast). 72 73 * Use statistics 74 Given the number of solutions on a certain partially instantiated 75 rdf/3 call (and the required execution time), reorganise them to 76 minimalise the cost. 77 78 * Use constraint solving 79 Instead of trying to solve an rdf/3 call, create a constraint from 80 it and only try to solve it if there is more information. This 81 is especially attractive if some form of high-level reasoning 82 from the language entailment rules can be applied or set-theory 83 is a possibility. 84 85After experiments with using constraint solving, this module now uses 86planning based on statistical information provided by the rdf_db module. 87This algorithm reaches optimal performance under quite reasonable 88assumptions while the planning overhead is very reasonable. The 89algorithm is described in "An optimised Semantic Web query language 90implementation in Prolog", available from 91http://www.swi-prolog.org/download/publications/ICLP05-SeRQL.pdf 92 93NOTES: 94 95 * SeRQL LIKE works on resources *and* literals. Do we want this? 96 See http://rdf4j.org/forum/mvnforum/viewthread?thread=275 97- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ 98 99:- multifile 100 user:goal_expansion/2. 101 102usergoal_expansion(rdf_complexity(G0, C), rdf_complexity(G, C)) :- 103 expand_goal(G0, G). 104usergoal_expansion(rdf_optimise(G0, O), rdf_optimise(G, O)) :- 105 expand_goal(G0, G). 106usergoal_expansion(rdf_optimise(G0, O, C, E), rdf_optimise(G, O, C, E)) :- 107 expand_goal(G0, G). 108 109 110/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 111Plan (conjunctions) 112 113 * Generate permutations and costs. Select cheapest 114 * The above moves tests right after the RDF call filling 115 its input argument. As a last optimisation some of these 116 searches may be integrated in the rdf/3 call to help using 117 indexing of the RDF database. 118 119complexity/2 needs to update the order of clauses inside meta calls 120(notably optional path expressions). 121- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
130rdf_optimise(Module:Goal) :-
131 rdf_optimise(Goal, Optimised),
132 call(Module:Optimised).
rdf_db.pl
and SeRQL runtime predicates. Optimized is a semantically
equivalent goal, obtained by re-ordering conjunctions in Goal
and processing sub-queries that do not share variables
independently.143rdf_optimise(Conj, Optimised) :- 144 rdf_optimise(Conj, Optimised, _, _). 145 146rdf_optimise(Conj, Optimised, Space, Time) :- 147 debug(rdf_optimise, '*** OPTIMIZING ***~n', []), 148 dbg_portray_body(Conj), 149 term_variables(Conj, Vars), 150 rdf_complexity(Conj, Conj, S0, E0), 151 State = state(Vars-Conj, S0, E0, 1), 152 debug(rdf_optimise, 'C0 = ~w~n', [E0]), 153 ( reorder(Conj, Perm), 154 ( debugging(rdf_optimise(all)) 155 -> dbg_portray_body(Perm) 156 ; true 157 ), 158 rdf_complexity(Perm, Perm1, S, C), 159 debug(rdf_optimise(all), '--> space=~w, time=~w~n', [S, C]), 160 161 arg(4, State, N), 162 N2 is N + 1, 163 nb_setarg(4, State, N2), 164 165 ( arg(3, State, C0), 166 C < C0 167 -> debug(rdf_optimise, 168 'BETTER ONE [~D]: --> space=~w, time=~w~n', [N, S, C]), 169 dbg_portray_body(Perm1), 170 nb_setarg(3, State, C), 171 nb_setarg(2, State, S), 172 nb_setarg(1, State, Vars-Perm1) 173 ; true 174 ), 175 fail 176 ; arg(1, State, Vars-Optimised), 177 arg(2, State, Space), 178 arg(3, State, Time), 179 debug(rdf_optimise, ' --> optimised: s/t = ~w/~w --> ~w/~w~n', 180 [S0, E0, Space, Time]), 181 dbg_portray_body(Optimised) 182 ), 183 !. 184optimise_order(Conj, Conj, -1, -1) :- 185 debug(rdf_optimise, 'Failed to optimise:~n', []), 186 dbg_portray_body(Conj). 187 188 189 /******************************* 190 * REORDERING * 191 *******************************/
201reorder(Goal, Reordered) :- 202 State = bindings([]), 203 conj_to_list(Goal, Conj0), 204 reorder_conj(Conj0, State, Conj1), 205 list_to_conj(Conj1, Reordered), 206 arg(1, State, Bindings), 207 unbind(Bindings). 208 209reorder_conj([One], _, [One]) :- !. 210reorder_conj(List, State, Perm) :- 211 group_by_cut(List, Before, Cut, After), 212 !, 213 reorder_conj(Before, State, PermBefore), 214 bind_args(Before, State), % this part is done 215 reorder_conj(After, State, PermAfter), 216 append(PermBefore, [Cut|PermAfter], Perm). 217reorder_conj(List, State, Perm) :- 218 group_by_optional(List, Normal, Optional), 219 !, 220 reorder_conj(Normal, State, PermNormal), 221 bind_args(Normal, State), % this part is done 222 reorder_conj(Optional, State, PermOptional), 223 append(PermNormal, PermOptional, Perm). 224reorder_conj(List, State, [goal(_,independent(SubPerms),_)]) :- 225 make_subgraphs(List, SubGraphs), 226 SubGraphs \= [_], 227 !, 228 reorder_subgraph_conjs(SubGraphs, State, SubPerms). 229reorder_conj(List, State, [Prefix|Perm]) :- 230 select(Prefix, List, Rest), 231 bind_args(Prefix, State), 232 make_subgraphs(Rest, SubGraphs), 233 ( SubGraphs = [SubGraph] 234 -> reorder_conj2(SubGraph, State, Perm) 235 ; Perm = [goal(_,independent(SubPerms),_)], 236 reorder_subgraph_conjs(SubGraphs, State, SubPerms) 237 ).
248reorder_subgraph_conjs([], _, []). 249reorder_subgraph_conjs([H0|T0], State, [H|T]) :- 250 reorder_conj2(H0, State, H1), 251 list_to_conj(H1, H), 252 reorder_subgraph_conjs(T0, State, T). 253 254reorder_conj2([One], _, [One]) :- !. 255reorder_conj2(List, State, [Prefix|Perm]) :- 256 select(Prefix, List, Rest), 257 bind_args(Prefix, State), 258 make_subgraphs(Rest, SubGraphs), 259 ( SubGraphs = [SubGraph] 260 -> reorder_conj2(SubGraph, State, Perm) 261 ; Perm = [goal(_,independent(SubPerms),_)], 262 reorder_subgraph_conjs(SubGraphs, State, SubPerms) 263 ).
270group_by_cut(Goals, Before, Cut, After) :-
271 Cut = goal(_, !, _),
272 append(Before, [Cut|After], Goals),
273 !.
281group_by_optional(List, Normal, Optional) :- 282 split_optional(List, Normal, Optional), 283 Normal \== [], 284 Optional \== []. 285 286split_optional([], [], []). 287split_optional([H|T0], Normal, Optional) :- 288 ( optional(H) 289 -> Optional = [H|T], 290 split_optional(T0, Normal, T) 291 ; Normal = [H|T], 292 split_optional(T0, T, Optional) 293 ). 294 295optional(G) :- 296 goal(G, (_*->_;_)).
literal(B)
, generated by optimising where constraints is handled
special.
State is a term bindings(List)
that is destructively maintained
by instantiate/4.
308bind_args([], _) :- !. 309bind_args([H|T], State) :- 310 !, 311 bind_args(H, State), 312 bind_args(T, State). 313bind_args(H, State) :- 314 goal(H, A=literal(B)), 315 var(A), var(B), 316 !, 317 ( instantiated(A, I), 318 I \== (-) 319 -> instantiate(B, _, I, State) 320 ; instantiated(B, I), 321 I \== (-) 322 -> instantiate(A, _, I, State) 323 ; true 324 ). 325bind_args(Goal, State) :- 326 vars(Goal, Vars), 327 ground_vars(Vars, State). 328 329ground_vars([], _). 330ground_vars([H|T], State) :- 331 instantiate(H, _, r, State), 332 ground_vars(T, State).
342make_subgraphs([Goal], [[Goal]]) :- !. 343make_subgraphs([G1,G2], Graphs) :- 344 !, 345 unbound_vars(G1, V1), 346 unbound_vars(G2, V2), 347 ( ord_intersect(V1, V2) 348 -> Graphs = [[G1,G2]] 349 ; Graphs = [[G1],[G2]] 350 ). 351make_subgraphs(Goals, SubGraphs) :- 352 map_list_to_pairs(unbound_vars, Goals, UnBoundKeyed), 353 connected_pairs(UnBoundKeyed, Edges), 354 vertices_edges_to_ugraph(Goals, Edges, UGraph), 355 connected_vertices(UGraph, SubGraphs). 356 357connected_pairs([], []). 358connected_pairs([H|T], Edges) :- 359 connected_pairs(T, H, Edges, EdgeTail), 360 connected_pairs(T, EdgeTail). 361 362connected_pairs([], _, Edges, Edges). 363connected_pairs([H|T], To, Edges, EdgeTail) :- 364 ( connected(H, To, GH, GT) 365 -> Edges = [GH-GT,GT-GH|Edges1], 366 connected_pairs(T, To, Edges1, EdgeTail) 367 ; connected_pairs(T, To, Edges, EdgeTail) 368 ). 369 370connected(V1-G1, V2-G2, G1, G2) :- 371 ord_intersect(V1, V2). 372 373connected_vertices([], []) :- !. 374connected_vertices(UGraph, [Set1|Sets]) :- 375 UGraph = [V1-_|_], 376 reachable(V1, UGraph, Set1), 377 del_vertices(UGraph, Set1, UGraph2), 378 connected_vertices(UGraph2, Sets).
385unbound_vars(Goal, Vars) :- 386 vars(Goal, AllVars), 387 unbound(AllVars, Vars0), 388 sort(Vars0, Vars). 389 390unbound([], []). 391unbound([H|T0], [H|T]) :- 392 instantiated(H, -), 393 !, 394 unbound(T0, T). 395unbound([_|T0], T) :- 396 unbound(T0, T).
! goal(Id, Goal, Vars)
Where Id is a goal identifier, Goal is the goal itself and Vars is a list of variables inside the Goal. Variables are sorted to the standard order of terms.
408conj_to_list(Conj, List) :- 409 phrase(conj_to_list(Conj, 1, _), List). 410 411conj_to_list((A,B), N0, N) --> 412 !, 413 conj_to_list(A, N0, N1), 414 conj_to_list(B, N1, N). 415conj_to_list(true, N, N) --> 416 !, 417 []. 418conj_to_list(G, N0, N) --> 419 { term_variables(G, Vars0), 420 sort(Vars0, Vars), 421 N is N0 + 1 422 }, 423 [ goal(N0, G, Vars) 424 ].
430list_to_conj([], true). 431list_to_conj([goal(_,G,_)], G) :- !. 432list_to_conj([goal(_,G,_)|T0], (G,T)) :- 433 list_to_conj(T0, T).
442%id(goal(Id, _, _), Id). 443goal(goal(_, Goal, _), Goal). 444vars(goal(_, _, Vars), Vars). 445 446 447 /******************************* 448 * BINDING * 449 *******************************/ 450 451/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 452Keep track of binding status of a variable. There are basically three 453ways. One is to use a seperate (assoc) table. Alternatively we can use 454attributed variables and delete the attributes at the end, and finally 455we can use normal variables and recreate the original goal by unbinding 456them again. 457 458Using assocs requires us to pass these things around and involves a 459log(N) complexity lookup. Using real terms has the disadvantage that to 460unbind we have to copy the term, thus loosing bindings it may have with 461the environment. Using attributes suffers neither of these problems and 462its only drawback is relying on non-standard Prolog features. 463 464Note that the remainder of the algorithm uses sets organised to the 465standard order of terms. As putting attributes does not change the 466identity of global stack variables and goals are global stack terms this 467is guaranteed. 468- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
b
). Bindings is a
term bindings(List)
, which is updated using destructive
assignment. List is the list of all variables to which we added
attributes.479instantiate_obj(Arg, Old, New, Bindings) :- 480 ( var(Arg) 481 ; ground(Arg) 482 ), 483 !, 484 instantiate(Arg, Old, New, Bindings). 485instantiate_obj(literal(Pattern, Var), +(Pattern), New, Bindings) :- 486 instantiate(Var, _, New, Bindings). 487 488instantiate(Var, Old, New, Bindings) :- 489 instantiated(Var, Old), 490 ( Old == (-) 491 -> put_attr(Var, instantiated, New), 492 arg(1, Bindings, B0), 493 setarg(1, Bindings, [Var|B0]) 494 ; true 495 ). 496 497instantiated(Term, How) :- 498 ( nonvar(Term) 499 -> How = +(+) 500 ; get_attr(Term, instantiated, H) 501 -> How = +(H) 502 ; How = - 503 ). 504 505uninstantiate(Term, How) :- 506 ( get_attr(Term, instantiated, How) 507 -> del_attr(Term, instantiated) 508 ; true 509 ).
520instantiate_unify(A, B, State) :- 521 instantiated(B, +(_)), 522 !, 523 instantiate(A, _, b, State). 524instantiate_unify(A, B, State) :- 525 instantiated(A, +(_)), 526 !, 527 instantiate(B, _, b, State). 528instantiate_unify(_, _, _).
538instantiatedattr_unify_hook(Attr, Value) :- 539 get_attr(Value, instantiated, Attr). 540 541instantiatedattr_portray_hook(Value, _Var) :- 542 write(+(Value)).
548instantiate_term(Term, How) :- 549 compound(Term), 550 !, 551 functor(Term, _, Arity), 552 instantiate_args(Arity, Term, How). 553instantiate_term(Term, How) :- 554 ( var(Term), 555 \+ get_attr(Term, instantiated, _) 556 -> put_attr(Term, instantiated, How) 557 ; true 558 ). 559 560instantiate_args(0, _, _) :- !. 561instantiate_args(N, Term, How) :- 562 arg(N, Term, A), 563 instantiate_term(A, How), 564 N2 is N - 1, 565 instantiate_args(N2, Term, How).
572uninstantiate_term(Term, How) :- 573 compound(Term), 574 !, 575 functor(Term, _, Arity), 576 uninstantiate_args(Arity, Term, How). 577uninstantiate_term(Term, How) :- 578 uninstantiate(Term, How). 579 580uninstantiate_args(0, _, _) :- !. 581uninstantiate_args(N, Term, How) :- 582 arg(N, Term, A), 583 uninstantiate_term(A, How), 584 N2 is N - 1, 585 uninstantiate_args(N2, Term, How).
593delete_instantiated([], []). 594delete_instantiated([H|T0], L) :- 595 ( instantiated(H, -) 596 -> L = [H|T], 597 delete_instantiated(T0, T) 598 ; delete_instantiated(T0, L) 599 ). 600 601 602 /******************************* 603 * COMPLEXITY * 604 *******************************/
E = 1 + B0 + B0*B1 + B0*B1*B2, ...
Non-RDF calls are supposed to be boolean tests that can be executed at the first opportunity all arguments are bound by RDF calls. They have a probability of failure, reducing the search space. Using the above formula, any number lower than 1 moves the test as far as possible to the head of the conjunction.
If GoalIn and GoalOut are the same the system will not try to optimize local conjunctions.
ISSUES: control structures ;, if-then-else, etc.
626rdf_complexity(Goal, SpaceEstimate, TimeEstimate) :- 627 rdf_complexity(Goal, Goal, SpaceEstimate, TimeEstimate). 628 629rdf_complexity(Goal0, Goal, Space, Time) :- 630 State = bindings([]), 631 complexity(Goal0, Goal, State, 1, Space, 0, Time), 632 arg(1, State, Bindings), 633 unbind(Bindings). 634 635unbind([]). 636unbind([H|T]) :- 637 del_attr(H, instantiated), 638 unbind(T). 639 640% complexity(:GoalIn, -GoalOut, 641% +State, 642% +SpaceIn, -SpaceOut, 643% +CountIn, -CountOut) 644% 645% Compute the complexity of Goal. Vars is an assoc holding the 646% variables bound earlier in the conjunction. Space keeps the size 647% of the search space and Count is the cummulative count of the 648% costs for exploring the search space espressed in the number of 649% nodes that will be visited. 650% 651% The (G*->_=true;_=false) clause deals with the code generated 652% from optional graph specs as provided by SeRQL. 653 654complexity((A0,B0), (A,B), State, Sz0, Sz, C0, C) :- 655 !, 656 complexity(A0, A, State, Sz0, Sz1, C0, C1), 657 complexity(B0, B, State, Sz1, Sz, C1, C). 658complexity((G0*->True;False), 659 ( G*->True;False), State, Sz0, Sz, C0, C) :- 660 !, 661 ( var(G) 662 -> optimise_order(G0, G, Sz1, C1), 663 Sz is Sz0 * Sz1, 664 C is C0+Sz0*C1 665 ; complexity(G, G, State, Sz0, Sz, C0, C) 666 ). 667complexity((If0->Then0;Else0), % dubious 668 ( If->Then; Else), State, Sz0, Sz, C0, C) :- 669 !, 670 ( var(If) 671 -> optimise_order(If0, If, Sz1, C1), 672 optimise_order(Then0, Then, Sz2, C2), 673 optimise_order(Else0, Else, Sz3, C3), 674 Sz is max(Sz0 * Sz1 * Sz2, Sz0 * Sz3), 675 C is C0 + max(Sz0*C1+Sz0*Sz1*C2, Sz0*C1+Sz0*Sz1*C3) 676 ; complexity(If, If, State, Sz0, Sz1, C0, C1), 677 complexity(Then, Then, State, Sz1, Sz2, C1, C2), 678 complexity(Else, Else, State, Sz0, Sz3, C0, C3), 679 Sz is max(Sz2, Sz3), 680 C is max(C2, C3) 681 ). 682complexity((A0;B0), (A;B), State, Sz0, Sz, C0, C) :- 683 !, 684 ( var(A) 685 -> optimise_order(A0, A, _, _), % First try the cheap one? 686 optimise_order(B0, B, _, _) 687 ; A = A0, 688 B = B0 689 ), 690 complexity(A, A, State, Sz0, SzA, C0, CA), 691 complexity(B, B, State, Sz0, SzB, C0, CB), 692 Sz is SzA + SzB, 693 C is CA + CB. 694complexity(sparql_group(G0), sparql_group(G), State, Sz0, Sz, C0, C) :- 695 !, 696 ( var(G) 697 -> rdf_optimise(G0, G, Sz1, C1), 698 Sz is Sz0 * Sz1, 699 C is C0+Sz0*C1 700 ; complexity(G, G, State, Sz0, Sz, C0, C) 701 ). 702complexity(sparql_group(G0, OV, IV), 703 sparql_group(G, OV, IV), 704 State, Sz0, Sz, C0, C) :- 705 !, 706 ( var(G) 707 -> rdf_optimise(G0, G, Sz1, C1), 708 Sz is Sz0 * Sz1, 709 C is C0+Sz0*C1 710 ; complexity(G, G, State, Sz0, Sz, C0, C) 711 ). 712complexity(rdfql_carthesian(Bags), 713 rdfql_carthesian(Bags), State, Sz0, Sz, C0, C) :- 714 !, 715 carth_complexity(Bags, State, Sz0, Sz, C0, 0, C). 716complexity(independent(Goals0), Goal, State, Sz0, Sz, C0, C) :- 717 !, 718 independent_complexity(Goals0, Goal, State, Sz0, Sz, C0, 0, C). 719complexity(Goal, Goal, State, Sz0, Sz, C0, C) :- 720 Goal = member(Var, List), % List is list of resources 721 !, 722 instantiate(Var, _, b, State), 723 length(List, Branch), 724 Sz is Sz0 * Branch, 725 C is C0 + Sz0*0.2 + Sz*0.2. 726complexity(Goal, Goal, State, Sz, Sz, C0, C) :- 727 Goal = (A=B), 728 !, 729 instantiate_unify(A, B, State), 730 C is C0 + 0.2. 731complexity(Goal, Goal, State, Sz, Sz, C0, C) :- 732 Goal = (Var=literal(V)), 733 !, 734 instantiated(V, +(_)), 735 instantiate(Var, _, b, State), 736 C is C0 + 0.2. 737complexity(Goal, Goal, State, Sz0, Sz, C0, C) :- 738 rdf_db_goal(Goal, S, P, O), 739 !, 740 instantiate(S, SI, b, State), 741 instantiate(P, PI, b, State), 742 instantiate_obj(O, OI, b, State), 743 complexity0(SI, PI, OI, P, Goal, SetUp, PerAlt, Branch), 744 Sz is Sz0 * Branch, 745 C is C0 + Sz0*SetUp + Sz*PerAlt, 746 debug(rdf_optimise(complexity), 'Complexity ~p: (~w) ~w --> ~w', 747 [i(SI,PI,OI), Goal, Branch, C]). 748complexity(sparql_eval(E,V), sparql_eval(E,V), _, Sz0, Sz, C0, C) :- 749 !, 750 term_variables(E, Vars), 751 all_bound(Vars), 752 Sz is Sz0, % probability of failure 753 C is C0 + Sz*20. % Sz * CostOfEval 754complexity(sparql_true(E), sparql_true(E), _, Sz0, Sz, C0, C) :- 755 !, 756 term_variables(E, Vars), 757 all_bound(Vars), 758 Sz is Sz0 * 0.5, % probability of failure 759 C is C0 + Sz0*20. % Sz * CostOfEval 760complexity(G, G, _, Sz0, Sz, C0, C) :- % non-rdf tests 761 term_variables(G, Vars), 762 all_bound(Vars), 763 Sz is Sz0 * 0.5, % probability of failure 764 C is C0 + Sz0. % Sz * CostOfTest 765 766all_bound([]). 767all_bound([H|T]) :- 768 instantiated(H, +(_)), 769 all_bound(T).
779carth_complexity([], _, Sz, Sz, _, C, C). 780carth_complexity([bag(_,G)|T], State, 781 Sz0, Sz, 782 C0, Csum0, Csumz) :- 783 complexity(G, G, State, Sz0, Sz1, C0, C1), 784 Csum1 is Csum0 + C1, 785 carth_complexity(T, State, Sz1, Sz, C0, Csum1, Csumz).
798independent_complexity(GoalsIn, Goal, State, 799 Size0, Size, 800 Time0, TimeSum0, TimeSum) :- 801 indep_complexity(GoalsIn, Goals1, State, 802 Size0, Size, 803 Time0, TimeSum0, TimeSum), 804 keysort(Goals1, ByCost), 805 pairs_values(ByCost, Goals2), 806 simplify_carthesian(Goals2, Goal). 807 808indep_complexity([], [], _, Sz, Sz, _, C, C). 809indep_complexity([G0|GT0], [SzG-bag(Vars,G,SzG,CG)|GT], State, 810 Sz0, Sz, 811 C0, Csum0, Csumz) :- 812 complexity(G0, G, State, Sz0, Sz1, C0, C1), 813 term_variables(G, VList), 814 Vars =.. [v|VList], 815 ( Sz0 =:= 0 816 -> SzG = 0, 817 CG = 0 818 ; SzG is Sz1/Sz0, 819 CG is (C1-C0)/Sz0 820 ), 821 Csum1 is Csum0 + C1, 822 indep_complexity(GT0, GT, State, Sz1, Sz, C0, Csum1, Csumz).
830simplify_carthesian([], true). 831simplify_carthesian([bag(_,Goal,_Branch,_Cost)], Goal) :- !. 832simplify_carthesian([bag(_,Goal,Branch,_Cost)|Bags], Final) :- 833 ( Branch < 1.5 834 -> !, Final = (Goal, Final1), 835 simplify_carthesian(Bags, Final1) 836 ; Bags == [] 837 -> !, Final = Goal 838 ). 839simplify_carthesian(Bags, rdfql_carthesian(Bags)).
Literal `like' matches come out as +(like(Pattern)
). We must
estimate the percentage of the literals that match this pattern.
Suppose the factor is 1,000. This means the branching is reduced
by 1,000, but finding each solution is slow as it requires a
linear scan. It is faster than going all the way back to Prolog
backtracking however, so we estimate a factor 10 (TBD: verify
that number).
ISSUES: rdf_has/3 vs rdf_reachable/3.
859complexity0(+(_),+(_),+(_), _, _, 1, 0, 1) :- !. 860complexity0(+(b),+(+),-, P, G, 1, 1, B) :- 861 !, 862 subj_branch_factor(G, B, Prop), 863 rdf_predicate_property(P, Prop). 864complexity0(-,+(+),+(b), P, G, 1, 1, B) :- 865 !, 866 obj_branch_factor(G, B, Prop), 867 rdf_predicate_property(P, Prop). 868complexity0(+(b), -, -, _, _, 1, 1, B) :- 869 !, 870 rdf_statistics(triples(Total)), 871 subject_count(Subjs), 872 ( Total == 0 873 -> B = 0 874 ; B is Total/Subjs 875 ). 876complexity0(_,_,+(like(Pat)),_, G, Factor, Factor, B) :- 877 !, 878 rdf_estimate_complexity(G, B0), 879 pattern_filter(Pat, Factor0), 880 Factor is max(1, min(B0, Factor0)/10), 881 B is B0/Factor. 882complexity0(_,_,_, _, G, 1, 1, B) :- 883 rdf_estimate_complexity(G, B). 884 885:- if(rdf_statistics(subjects(_))). 886subject_count(Count) :- % RDF-DB 2.x 887 rdf_statistics(subjects(Count)). 888:- else. 889subject_count(Count) :- % RDF-DB 3.x 890 rdf_statistics(resources(Count)). 891:- endif. 892 893:- multifile 894 subj_branch_factor/3, 895 obj_branch_factor/3. 896 897subj_branch_factor(rdf(_,_,_), X, rdf_subject_branch_factor(X)). 898subj_branch_factor(rdf_has(_,_,_), X, rdfs_subject_branch_factor(X)). 899subj_branch_factor(rdf_reachable(_,_,_), X, rdfs_subject_branch_factor(X)). 900 901obj_branch_factor(rdf(_,_,_), X, rdf_object_branch_factor(X)). 902obj_branch_factor(rdf_has(_,_,_), X, rdfs_object_branch_factor(X)). 903obj_branch_factor(rdf_reachable(_,_,_), X, rdfs_object_branch_factor(X)).
915:- multifile 916 rdf_db_goal/4. 917 918rdf_db_goal(rdf(S,P,O), S,P,O). 919rdf_db_goal(rdf_has(S,P,O), S,P,O). 920rdf_db_goal(rdf_reachable(S,P,O), S,P,O). 921rdf_db_goal(rdf(S,P,O, _DB), S,P,O). % TBD: less hits
928pattern_filter(Like, Factor) :- 929 atom_codes(Like, Codes), 930 pattern_factor(Codes, 1, Factor). 931 932pattern_factor([], F, F). 933pattern_factor([0'*|T], F0, F) :- 934 !, 935 pattern_factor(T, F0, F). 936pattern_factor([_|T], F0, F) :- 937 F1 is F0*10, 938 pattern_factor(T, F1, F).
In addition, rdf_reachable/3 introduces its own complexity which must be estimate using the branching factor of the relation.
950rdf_estimate_complexity(G, C) :- 951 rdf_db_goal(G, S, P0, O), 952 map_predicate(P0, P), 953 rdf_estimate_complexity(S, P, O, C). 954 955map(map_predicate(_,_)). 956map(map_predicate(_,_):-_). 957 958term_expansion(In, Out) :- 959 map(In), 960 !, 961 rdf_global_term(In, Out). 962 963map_predicate(X, X) :- 964 var(X), 965 !. 966map_predicate(serql:directSubClassOf, rdfs:subClassOf) :- !. 967map_predicate(serql:directType, rdf:type) :- !. 968map_predicate(serql:directSubPropertyOf, rdfs:subPropertyOf) :- !. 969map_predicate(X, X). 970 971 972 /******************************* 973 * INSTANTIATE OPTIONAL * 974 *******************************/ 975 976/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 977In SELECT queries, optional parts of the path expression leave 978uninstantiated variables. These must be bound to '$null$' to be able to 979do correct merging for DISTINCT. The naive way to do this is to 980instantiate all variables at the end of the query. On large selects 981(i.e. involving many variables) this appears to be quite costly. Doing 982the job early, as in 983 984 ( Goal 985 *-> true 986 ; bind_null(VarsInGoal) 987 ) 988 989is not correct as well, as VarsInGoal may be involved in other parts of 990the code either before or after the optional path expression. So we need 991to: 992 993 * Do abtract execution and find the bindings done before arriving 994 at Goal. 995 * continue the execution, and watch for new bindings to these 996 variables. If we find a binding for the second time, remove 997 it from the first and make a conditional binding for it. 998 999If we bind an argument unconditionally, we place an attribute 'b'. If it 1000is conditionally bound, we place an attribute c(Set), where Set is the 1001set in which it was bound or plain 'c' if it was conditionally bound in 1002multiple places. 1003 1004TBD: disjunctions and other control structures. 1005 queries that do not bind (such as SPARQL bound(X)) 1006- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ 1007 1008serql_select_bind_null(Goal0, Goal) :- 1009 State = bindings([]), 1010 select_bind_null(Goal0, Goal1, State), 1011 arg(1, State, Bindings), 1012 c_unbind(Bindings, Left), 1013 ( Left == [] 1014 -> Goal = Goal1 1015 ; Goal = (Goal1, rdfql_cond_bind_null(Left)) 1016 ). 1017 1018c_unbind([], []). 1019c_unbind([H|T0], L) :- 1020 ( get_attr(H, instantiated, c) 1021 -> L = [H|T] 1022 ; L = T 1023 ), 1024 del_attr(H, instantiated), 1025 c_unbind(T0, T). 1026 1027 1028select_bind_null((A0,B0), (A,B), State) :- 1029 !, 1030 select_bind_null(A0, A, State), 1031 select_bind_null(B0, B, State). 1032select_bind_null((G0 *-> true; true), 1033 ( G *-> true; Bind), 1034 State) :- 1035 !, 1036 arg(1, State, B0), 1037 select_bind_null(G0, G, State), 1038 arg(1, State, B), 1039 Bind = rdfql_bind_null(Vars), 1040 c_bindings(B, B0, c(Bind), Vars). 1041select_bind_null(rdfql_carthesian(List0), 1042 rdfql_carthesian(List), State) :- 1043 !, 1044 select_carth_bind_null(List0, List, State). 1045select_bind_null(sparql_group(A0), sparql_group(A), State) :- 1046 !, 1047 select_bind_null(A0, A, State). 1048select_bind_null(sparql_group(A0,Outer,Inner), 1049 sparql_group(A, Outer,Inner), 1050 State) :- 1051 !, 1052 select_bind_null(A0, A, State), 1053 term_variables(Outer, Vars), 1054 c_bind(Vars, State). 1055select_bind_null(Goal, Goal, State) :- 1056 term_variables(Goal, Vars), 1057 c_bind(Vars, State).
1067c_bindings(B, B0, _, Vars) :- 1068 B == B0, 1069 !, 1070 Vars = []. 1071c_bindings([H|T0], B0, Attr, [H|Vars]) :- 1072 get_attr(H, instantiated, I), 1073 is_instantiated(I), 1074 !, 1075 put_attr(H, instantiated, Attr), 1076 c_bindings(T0, B0, Attr, Vars). 1077c_bindings([_|T0], B0, Attr, Vars) :- 1078 c_bindings(T0, B0, Attr, Vars). 1079 1080 1081is_instantiated(b). % unconditionally bound 1082is_instantiated(c(_)). % bound either by call or rdfql_bind_null/1
1091c_bind([], _). 1092c_bind([H|T], State) :- 1093 ( get_attr(H, instantiated, I) 1094 -> ( I == b % already unconditionally bound 1095 -> true 1096 ; I = c(Set) 1097 -> arg(1, Set, Vars0), 1098 del_var(H, Vars0, Vars), 1099 setarg(1, Set, Vars), 1100 put_attr(H, instantiated, c) 1101 ; I == c 1102 -> true 1103 ) 1104 ; put_attr(H, instantiated, b), 1105 arg(1, State, B0), 1106 setarg(1, State, [H|B0]) 1107 ), 1108 c_bind(T, State). 1109 1110del_var(H, [X|T0], T) :- 1111 ( H == X 1112 -> T = T0 1113 ; T = [X|T1], 1114 del_var(H, T0, T1) 1115 ). 1116 1117select_carth_bind_null([], [], _). 1118select_carth_bind_null([bag(Vars, G0)|T0], [bag(Vars, G)|T], State) :- 1119 select_bind_null(G0, G, State), 1120 select_carth_bind_null(T0, T, State). 1121 1122 1123 /******************************* 1124 * DEBUG SUPPORT * 1125 *******************************/ 1126 1127dbg_portray_body(G) :- 1128 debugging(rdf_optimise), 1129 !, 1130 portray_body(G). 1131dbg_portray_body(_). 1132 1133portray_body(G) :- 1134 ( pp_instantiate_term(G), 1135 debug(_, '~@', 1136 [ portray_clause(current_output, (<> :- G), [module(sparql_runtime)]) 1137 ]), 1138 fail 1139 ; true 1140 ). 1141 1142pp_instantiate_term(Term) :- 1143 compound(Term), 1144 !, 1145 functor(Term, _, Arity), 1146 pp_instantiate_args(Arity, Term). 1147pp_instantiate_term(Term) :- 1148 var(Term), 1149 get_attr(Term, instantiated, H), 1150 !, 1151 del_attr(Term, instantiated), 1152 Term = +(H). 1153pp_instantiate_term(_). 1154 1155pp_instantiate_args(0, _) :- !. 1156pp_instantiate_args(N, Term) :- 1157 arg(N, Term, A), 1158 pp_instantiate_term(A), 1159 N2 is N - 1, 1160 pp_instantiate_args(N2, Term). 1161 1162 1163 /******************************* 1164 * SANDBOX * 1165 *******************************/ 1166 1167:- multifile sandbox:safe_meta_predicate/1. 1168 1169sandbox:safe_meta_predicate(rdf_optimise:rdf_optimise/1)