View source with raw comments or as raw
    1/*  Part of ClioPatria SeRQL and SPARQL server
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2010-2018, University of Amsterdam,
    7                              VU University Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(url_cache,
   37          [ url_cache/3,                % +URI, -File, -MimeType
   38            url_cache_file/4,           % +URL, +Dir, +Ext, -Path)
   39            url_cache_delete/1,         % +URI
   40            url_cached/2,               % ?URL, ?Property
   41            url_cached/3,               % +Dir, ?URL, ?Property
   42            url_cache_reset_server_status/0,
   43            url_cache_reset_server_status/1 % +Server
   44          ]).   45:- use_module(library(http/http_open)).   46:- if(exists_source(library(http/http_ssl_plugin))).   47:- use_module(library(http/http_ssl_plugin)).   48:- endif.   49:- use_module(library(http/mimetype)).   50:- use_module(library(url)).   51:- use_module(library(debug)).   52:- use_module(library(error)).   53:- use_module(library(settings)).   54:- use_module(library(base64)).   55:- use_module(library(utf8)).   56:- use_module(library(lists)).   57:- use_module(library(sha)).   58
   59:- setting(cache:url_cache_directory, atom, 'cache/url',
   60           'Directory to cache fetched remote URLs').

Cache the content of external URLs in local files

This library provides a cache for data stored in extenal URLs. The content of each URL is kept in a file and described by a meta-file that remembers the mime-type, the original URL, when it was fetched and -if provided by the server- the last-modified stamp.

To be done
- The current implementation does not validate the cache content, nor does it honour the HTTP cache directives. */
 url_cache(+URI:atom, -Path:atom, -MimeType:atom) is det
Return the content of URI in a file at Path. MimeType is the Mime-type returned by the server.
Errors
- existence_error(url, URL) Server did not respond with 200 OK
- existence_error(source_sink, url_cache(.)) Cache directory does not exist
bug
- Does not check modification time and cache validity
   85url_cache(URL, Path, MimeType) :-
   86    url_cache_dir(Dir),
   87    url_cache_file(URL, Dir, url, Path),
   88    atom_concat(Path, '.meta', TypeFile),
   89    (   exists_file(Path),
   90        exists_file(TypeFile),
   91        read_meta_file(TypeFile, mime_type(MimeType0))
   92    ->  MimeType = MimeType0
   93    ;   fetch_url(URL, Path, MimeType, Modified),
   94        get_time(NowF),
   95        Now is round(NowF),
   96        open(TypeFile, write, Out,
   97             [ encoding(utf8),
   98               lock(write)
   99             ]),
  100        format(Out,
  101               'mime_type(~q).~n\c
  102                    url(~q).~n\c
  103                    fetched(~q).~n',
  104               [MimeType, URL, Now]),
  105        (   nonvar(Modified)
  106        ->  format(Out, 'last_modified(~q).~n', [Modified])
  107        ;   true
  108        ),
  109        close(Out)
  110    ).
  111
  112read_meta_file(MimeFile, Term) :-
  113    setup_call_cleanup(open(MimeFile, read, In,
  114                            [ encoding(utf8),
  115                              lock(read)
  116                            ]),
  117                       ndet_read(In, Term),
  118                       close(In)).
  119
  120ndet_read(Stream, Term) :-
  121    repeat,
  122    read(Stream, Term0),
  123    (   Term0 == end_of_file
  124    ->  !, fail
  125    ;   Term = Term0
  126    ).
 url_cache_delete(+URL) is det
Delete an URL from the cache. Succeeds, even if the cache files do not exist.
Errors
- Throws exceptions from delete_file/1 other than existence errors.
  136url_cache_delete(URL) :-
  137    url_cache_dir(Dir),
  138    url_cache_file(URL, Dir, url, Path),
  139    atom_concat(Path, '.meta', TypeFile),
  140    catch(delete_file(TypeFile), E0, true),
  141    catch(delete_file(Path), E1, true),
  142    error_ok(E0),
  143    error_ok(E1).
  144
  145error_ok(E) :-
  146    subsumes_term(error(existence_error(file, _), _), E),
  147    !.
  148error_ok(E) :-
  149    throw(E).
 url_cache_dir(-Dir) is det
Return or create the URL caching directory
  155url_cache_dir(Dir) :-
  156    setting(cache:url_cache_directory, Dir),
  157    make_directory_path(Dir).
 make_directory_path(+Dir) is det
Create Dir and all required components.
  163make_directory_path(Dir) :-
  164    make_directory_path_2(Dir),
  165    !.
  166make_directory_path(Dir) :-
  167    permission_error(create, directory, Dir).
  168
  169make_directory_path_2(Dir) :-
  170    exists_directory(Dir),
  171    !.
  172make_directory_path_2(Dir) :-
  173    Dir \== (/),
  174    !,
  175    file_directory_name(Dir, Parent),
  176    make_directory_path_2(Parent),
  177    make_directory(Dir).
 fetch_url(+URL:atom, +Path:atom, -MimeType:atom) is det
Errors
- existence_error(url, URL)
  183fetch_url(URL, File, MimeType, Modified) :-
  184    parse_url_ex(URL, Parts),
  185    server(Parts, Server),
  186    (   allow(Server)
  187    ->  true
  188    ;   throw(error(existence_error(url, URL),
  189                    context(url_cache/3, 'Too many errors from server')))
  190    ),
  191    get_time(Now),
  192    (   catch(fetch_url_raw(URL, File,
  193                            MimeType, Modified), E, true)
  194    ->  (   var(E)
  195        ->  register_stats(Server, Now, true)
  196        ;   register_stats(Server, Now, error(E)),
  197            throw(E)
  198        )
  199    ;   register_stats(Server, Now, false)
  200    ).
  201
  202server(Parts, Server) :-
  203    memberchk(host(Host), Parts),
  204    !,
  205    (   memberchk(port(Port), Parts)
  206    ->  Server = Host:Port
  207    ;   Server = Host
  208    ).
  209server(_,_) :-
  210    assertion(false).
  211
  212/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  213Server status assessment. We keep a   health-status  of the server using
  214the following rules:
  215
  216    * Range -100 .. 100
  217    * Ok if > 0
  218    * The initial status is 100 (healthy)
  219    * Possitive results add 20-4*sqrt(Time)
  220    * Negative results subtract 10
  221    * Add 1 per minute since last status.
  222
  223TBD: frequency matters: requests should not pile up.
  224- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  225
  226:- dynamic
  227    server_status/3.                % Server, Status, Last
  228
  229allow(Server) :-
  230    server_status(Server, Status),
  231    debug(url_cache, 'Status ~q: ~w', [Server, Status]),
  232    Status > 0.
  233
  234server_status(Server, Status) :-
  235    get_time(Now),
  236    with_mutex(url_cache_status,
  237               server_status(Server, S0, T0)),
  238    !,
  239    Status is min(100, S0 + round(Now-T0)//60).
  240server_status(_, 100).
  241
  242register_stats(Server, Start, Result) :-
  243    get_time(Now),
  244    Time is Now - Start,
  245    (   server_status(Server, S0, T0)
  246    ->  true
  247    ;   S0 = 100,
  248        T0 = Now
  249    ),
  250    Since is Start - T0,
  251    update_status(Result, Time, Since, S0, S1),
  252    with_mutex(url_cache_status,
  253               (   retractall(server_status(Server, _, _)),
  254                   assert(server_status(Server, S1, Start)))).
  255
  256update_status(true, Time, Since, S0, S) :-
  257    !,
  258    S is min(100, S0 + round(20-4*sqrt(Time)) + round(Since)//60).
  259update_status(_, Time, _Since, S0, S) :-
  260    !,
  261    S is max(-100, S0 - (10 + round(Time))).
 url_cache_reset_server_status is det
 url_cache_reset_server_status(+Server) is det
Reset the status of the given server or all servers.
  269url_cache_reset_server_status :-
  270    with_mutex(url_cache_status,
  271               retractall(server_status(_,_,_))).
  272url_cache_reset_server_status(Server) :-
  273    must_be(atom, Server),
  274    with_mutex(url_cache_status,
  275               retractall(server_status(Server,_,_))).
 fetch_url_raw(+URL:atom, +Path:atom, -MimeType:atom, -Modified) is det
Fetch data from URL and put it into the file Path. MimeType is unified with the MIME-type as reported by the server or text/plain if the server did not provide a MIME-Type.
Errors
- existence_error(url, URL)
  286fetch_url_raw(URL, File, MimeType, Modified) :-
  287    debug(url_cache, 'Downloading ~w ...', [URL]),
  288    atom_concat(File, '.tmp', TmpFile),
  289    (   catch(fetch_to_file(URL, TmpFile, Code, Header), E, true)
  290    ->  true
  291    ;   E = predicate_failed(http_get/3)
  292    ),
  293    (   var(E)
  294    ->  true
  295    ;   (   debugging(url_cache)
  296        ->  print_message(error, E)
  297        ;   true
  298        ),
  299        catch(delete_file(TmpFile), _, true),
  300        (   debugging(url_cache)
  301        ->  message_to_string(E, Msg),
  302            debug(url_cache, 'Download failed: ~w', [Msg])
  303        ;   true
  304        ),
  305        throw(E)
  306    ),
  307    (   Code == 200
  308    ->  rename_file(TmpFile, File)
  309    ;   catch(delete_file(TmpFile), _, true),
  310        throw(error(existence_error(url, URL), _))
  311    ),
  312    (   memberchk(content_type(MimeType0), Header)
  313    ->  true
  314    ;   MimeType0 = 'text/plain'
  315    ),
  316    ignore(memberchk(last_modified(Modified), Header)),
  317    debug(url_cache, 'Downloaded ~w, mime-type: ~w',
  318          [URL, MimeType0]),
  319    MimeType = MimeType0.
  320
  321fetch_to_file(URL, File, Code,
  322              [ content_type(ContentType),
  323                last_modified(LastModified)
  324              ]) :-
  325    setup_call_cleanup(
  326        open(File, write, Out, [ type(binary) ]),
  327        setup_call_cleanup(
  328            http_open(URL, In,
  329                      [ header(content_type, ContentType),
  330                        header(last_modified, LastModified),
  331                        status_code(Code),
  332                        cert_verify_hook(ssl_verify)
  333                      ]),
  334            copy_stream_data(In, Out),
  335            close(In)),
  336        close(Out)).
  337
  338:- public ssl_verify/5.
 ssl_verify(+SSL, +ProblemCert, +AllCerts, +FirstCert, +Error)
Currently we accept all certificates.
  344ssl_verify(_SSL,
  345           _ProblemCertificate, _AllCertificates, _FirstCertificate,
  346           _Error).
  347
  348parse_url_ex(URL, Parts) :-
  349    is_list(URL),
  350    !,
  351    Parts = URL.
  352parse_url_ex(URL, Parts) :-
  353    parse_url(URL, Parts),
  354    !.
  355parse_url_ex(URL, _) :-
  356    domain_error(url, URL).
 url_cache_file(+URL, +Dir, +Ext, -Path) is det
Determine location of cache-file for the given URL in Dir. If Ext is provided, the returned Path is ensured to have the specified extension.
  364url_cache_file(URL, Dir, Ext, Path) :-
  365    url_to_file(URL, Ext, File),
  366    sub_atom(File, 0, 2, _, L1),
  367    ensure_dir(Dir, L1, Dir1),
  368    sub_atom(File, 2, 2, _, L2),
  369    ensure_dir(Dir1, L2, Dir2),
  370    sub_atom(File, 4, _, 0, LocalFile),
  371    atomic_list_concat([Dir2, /, LocalFile], Path).
  372
  373ensure_dir(D0, Sub, Dir) :-
  374    atomic_list_concat([D0, /, Sub], Dir),
  375    (   exists_directory(Dir)
  376    ->  true
  377    ;   make_directory(Dir)
  378    ).
 url_to_file(+URL, +Ext, -File) is det
File is a filename for storing URL and has extension Ext. We use a cryptographic hash to ensure consistent naming, a name that is guaranteed to fit in every sensible filesystem and ensure a good distribution of the cache directories.
  387url_to_file(URL, Ext, File) :-
  388    sha_hash(URL, Hash, []),
  389    phrase(hex_digits(Hash), Codes),
  390    string_to_list(String, Codes),
  391    file_name_extension(String, Ext, File).
  392
  393hex_digits([]) -->
  394    "".
  395hex_digits([H|T]) -->
  396    byte(H),
  397    hex_digits(T).
  398
  399byte(Byte) -->
  400    { High is (Byte>>4) /\ 0xf,
  401      Low is (Byte /\ 0xf),
  402      code_type(H, xdigit(High)),
  403      code_type(L, xdigit(Low))
  404    },
  405    [H,L].
  406
  407
  408                 /*******************************
  409                 *          READ CACHE          *
  410                 *******************************/
 url_cached(?URL, ?Property) is nondet
 url_cached(+Dir, ?URL, ?Property) is nondet
True if URL is in the cache represented by the directory Dir and has Property. Defined properties are:
file(-File)
File is the cache-file for the given URL
mime_type(-Mime)
Mime is the mime-type of the URL as reported by the server
fetched(-Stamp:integer)
Timestamp that specifies when the URL was fetched
last_modified(-Modified:atom)
If present, this is the modification time as provided by the server.
  428url_cached(URL, Property) :-
  429    url_cache_dir(Dir),
  430    url_cached(Dir, URL, Property).
  431
  432url_cached(Dir, URL, Property) :-
  433    nonvar(URL),
  434    !,
  435    url_cache_file(URL, Dir, url, Path),
  436    atom_concat(Path, '.meta', MetaFile),
  437    exists_file(MetaFile),
  438    cache_file_property(Property, MetaFile).
  439url_cached(Dir, URL, Property) :-
  440    nonvar(Property),
  441    Property = file(File),
  442    atom(File),
  443    atom_concat(Dir, Rest, File),
  444    \+ sub_atom(Rest, _, _, _, '../'),
  445    file_name_extension(Base, url, File),
  446    file_name_extension(Base, meta, MetaFile),
  447    exists_file(MetaFile),
  448    once(read_meta_file(MetaFile, url(URL))).
  449url_cached(Dir, URL, Property) :-
  450    atom_concat(Dir, '/??', TopPat),
  451    expand_file_name(TopPat, TopDirs),
  452    member(TopDir, TopDirs),
  453    atom_concat(TopDir, '/??', DirPat),
  454    expand_file_name(DirPat, FileDirs),
  455    member(FileDir, FileDirs),
  456    atom_concat(FileDir, '/*.meta', FilePat),
  457    expand_file_name(FilePat, MetaFiles),
  458    member(MetaFile, MetaFiles),
  459    once(read_meta_file(MetaFile, url(URL))),
  460    check_cache_file(MetaFile, URL),
  461    cache_file_property(Property, MetaFile).
  462
  463check_cache_file(MetaFile, URL) :-
  464    file_name_extension(File, meta, MetaFile),
  465    (   exists_file(File)
  466    ->  true
  467    ;   print_message(warning, url_cache(no_file(File, MetaFile, URL))),
  468        delete_file(MetaFile),
  469        fail
  470    ).
  471
  472cache_file_property(Property, MetaFile) :-
  473    var(Property),
  474    !,
  475    cache_file_property_ndet(Property, MetaFile).
  476cache_file_property(Property, MetaFile) :-
  477    cache_file_property_ndet(Property, MetaFile),
  478    !.
  479
  480
  481cache_file_property_ndet(file(File), MetaFile) :-
  482    file_name_extension(File, meta, MetaFile).
  483cache_file_property_ndet(P, MetaFile) :-
  484    read_meta_file(MetaFile, P),
  485    P \= url(_).
  486
  487                 /*******************************
  488                 *           MESSAGES           *
  489                 *******************************/
  490
  491:- multifile
  492    prolog:message//1.  493
  494prolog:message(url_cache(no_file(File, _MetaFile, URL))) -->
  495    [ 'URL Cache: file ~q does not exist (URL=~q)'-[File, URL] ]