35
36:- module(url_cache,
37 [ url_cache/3, 38 url_cache_file/4, 39 url_cache_delete/1, 40 url_cached/2, 41 url_cached/3, 42 url_cache_reset_server_status/0,
43 url_cache_reset_server_status/1 44 ]). 45:- use_module(library(http/http_open)). 46:- if(exists_source(library(http/http_ssl_plugin))). 47:- use_module(library(http/http_ssl_plugin)). 48:- endif. 49:- use_module(library(http/mimetype)). 50:- use_module(library(url)). 51:- use_module(library(debug)). 52:- use_module(library(error)). 53:- use_module(library(settings)). 54:- use_module(library(base64)). 55:- use_module(library(utf8)). 56:- use_module(library(lists)). 57:- use_module(library(sha)). 58
59:- setting(cache:url_cache_directory, atom, 'cache/url',
60 'Directory to cache fetched remote URLs').
85url_cache(URL, Path, MimeType) :-
86 url_cache_dir(Dir),
87 url_cache_file(URL, Dir, url, Path),
88 atom_concat(Path, '.meta', TypeFile),
89 ( exists_file(Path),
90 exists_file(TypeFile),
91 read_meta_file(TypeFile, mime_type(MimeType0))
92 -> MimeType = MimeType0
93 ; fetch_url(URL, Path, MimeType, Modified),
94 get_time(NowF),
95 Now is round(NowF),
96 open(TypeFile, write, Out,
97 [ encoding(utf8),
98 lock(write)
99 ]),
100 format(Out,
101 'mime_type(~q).~n\c
102 url(~q).~n\c
103 fetched(~q).~n',
104 [MimeType, URL, Now]),
105 ( nonvar(Modified)
106 -> format(Out, 'last_modified(~q).~n', [Modified])
107 ; true
108 ),
109 close(Out)
110 ).
111
112read_meta_file(MimeFile, Term) :-
113 setup_call_cleanup(open(MimeFile, read, In,
114 [ encoding(utf8),
115 lock(read)
116 ]),
117 ndet_read(In, Term),
118 close(In)).
119
120ndet_read(Stream, Term) :-
121 repeat,
122 read(Stream, Term0),
123 ( Term0 == end_of_file
124 -> !, fail
125 ; Term = Term0
126 ).
136url_cache_delete(URL) :-
137 url_cache_dir(Dir),
138 url_cache_file(URL, Dir, url, Path),
139 atom_concat(Path, '.meta', TypeFile),
140 catch(delete_file(TypeFile), E0, true),
141 catch(delete_file(Path), E1, true),
142 error_ok(E0),
143 error_ok(E1).
144
145error_ok(E) :-
146 subsumes_term(error(existence_error(file, _), _), E),
147 !.
148error_ok(E) :-
149 throw(E).
155url_cache_dir(Dir) :-
156 setting(cache:url_cache_directory, Dir),
157 make_directory_path(Dir).
163make_directory_path(Dir) :-
164 make_directory_path_2(Dir),
165 !.
166make_directory_path(Dir) :-
167 permission_error(create, directory, Dir).
168
169make_directory_path_2(Dir) :-
170 exists_directory(Dir),
171 !.
172make_directory_path_2(Dir) :-
173 Dir \== (/),
174 !,
175 file_directory_name(Dir, Parent),
176 make_directory_path_2(Parent),
177 make_directory(Dir).
183fetch_url(URL, File, MimeType, Modified) :-
184 parse_url_ex(URL, Parts),
185 server(Parts, Server),
186 ( allow(Server)
187 -> true
188 ; throw(error(existence_error(url, URL),
189 context(url_cache/3, 'Too many errors from server')))
190 ),
191 get_time(Now),
192 ( catch(fetch_url_raw(URL, File,
193 MimeType, Modified), E, true)
194 -> ( var(E)
195 -> register_stats(Server, Now, true)
196 ; register_stats(Server, Now, error(E)),
197 throw(E)
198 )
199 ; register_stats(Server, Now, false)
200 ).
201
202server(Parts, Server) :-
203 memberchk(host(Host), Parts),
204 !,
205 ( memberchk(port(Port), Parts)
206 -> Server = Host:Port
207 ; Server = Host
208 ).
209server(_,_) :-
210 assertion(false).
211
225
226:- dynamic
227 server_status/3. 228
229allow(Server) :-
230 server_status(Server, Status),
231 debug(url_cache, 'Status ~q: ~w', [Server, Status]),
232 Status > 0.
233
234server_status(Server, Status) :-
235 get_time(Now),
236 with_mutex(url_cache_status,
237 server_status(Server, S0, T0)),
238 !,
239 Status is min(100, S0 + round(Now-T0)//60).
240server_status(_, 100).
241
242register_stats(Server, Start, Result) :-
243 get_time(Now),
244 Time is Now - Start,
245 ( server_status(Server, S0, T0)
246 -> true
247 ; S0 = 100,
248 T0 = Now
249 ),
250 Since is Start - T0,
251 update_status(Result, Time, Since, S0, S1),
252 with_mutex(url_cache_status,
253 ( retractall(server_status(Server, _, _)),
254 assert(server_status(Server, S1, Start)))).
255
256update_status(true, Time, Since, S0, S) :-
257 !,
258 S is min(100, S0 + round(20-4*sqrt(Time)) + round(Since)//60).
259update_status(_, Time, _Since, S0, S) :-
260 !,
261 S is max(-100, S0 - (10 + round(Time))).
269url_cache_reset_server_status :-
270 with_mutex(url_cache_status,
271 retractall(server_status(_,_,_))).
272url_cache_reset_server_status(Server) :-
273 must_be(atom, Server),
274 with_mutex(url_cache_status,
275 retractall(server_status(Server,_,_))).
286fetch_url_raw(URL, File, MimeType, Modified) :-
287 debug(url_cache, 'Downloading ~w ...', [URL]),
288 atom_concat(File, '.tmp', TmpFile),
289 ( catch(fetch_to_file(URL, TmpFile, Code, Header), E, true)
290 -> true
291 ; E = predicate_failed(http_get/3)
292 ),
293 ( var(E)
294 -> true
295 ; ( debugging(url_cache)
296 -> print_message(error, E)
297 ; true
298 ),
299 catch(delete_file(TmpFile), _, true),
300 ( debugging(url_cache)
301 -> message_to_string(E, Msg),
302 debug(url_cache, 'Download failed: ~w', [Msg])
303 ; true
304 ),
305 throw(E)
306 ),
307 ( Code == 200
308 -> rename_file(TmpFile, File)
309 ; catch(delete_file(TmpFile), _, true),
310 throw(error(existence_error(url, URL), _))
311 ),
312 ( memberchk(content_type(MimeType0), Header)
313 -> true
314 ; MimeType0 = 'text/plain'
315 ),
316 ignore(memberchk(last_modified(Modified), Header)),
317 debug(url_cache, 'Downloaded ~w, mime-type: ~w',
318 [URL, MimeType0]),
319 MimeType = MimeType0.
320
321fetch_to_file(URL, File, Code,
322 [ content_type(ContentType),
323 last_modified(LastModified)
324 ]) :-
325 setup_call_cleanup(
326 open(File, write, Out, [ type(binary) ]),
327 setup_call_cleanup(
328 http_open(URL, In,
329 [ header(content_type, ContentType),
330 header(last_modified, LastModified),
331 status_code(Code),
332 cert_verify_hook(ssl_verify)
333 ]),
334 copy_stream_data(In, Out),
335 close(In)),
336 close(Out)).
337
338:- public ssl_verify/5.
344ssl_verify(_SSL,
345 _ProblemCertificate, _AllCertificates, _FirstCertificate,
346 _Error).
347
348parse_url_ex(URL, Parts) :-
349 is_list(URL),
350 !,
351 Parts = URL.
352parse_url_ex(URL, Parts) :-
353 parse_url(URL, Parts),
354 !.
355parse_url_ex(URL, _) :-
356 domain_error(url, URL).
364url_cache_file(URL, Dir, Ext, Path) :-
365 url_to_file(URL, Ext, File),
366 sub_atom(File, 0, 2, _, L1),
367 ensure_dir(Dir, L1, Dir1),
368 sub_atom(File, 2, 2, _, L2),
369 ensure_dir(Dir1, L2, Dir2),
370 sub_atom(File, 4, _, 0, LocalFile),
371 atomic_list_concat([Dir2, /, LocalFile], Path).
372
373ensure_dir(D0, Sub, Dir) :-
374 atomic_list_concat([D0, /, Sub], Dir),
375 ( exists_directory(Dir)
376 -> true
377 ; make_directory(Dir)
378 ).
387url_to_file(URL, Ext, File) :-
388 sha_hash(URL, Hash, []),
389 phrase(hex_digits(Hash), Codes),
390 string_to_list(String, Codes),
391 file_name_extension(String, Ext, File).
392
393hex_digits([]) -->
394 "".
395hex_digits([H|T]) -->
396 byte(H),
397 hex_digits(T).
398
399byte(Byte) -->
400 { High is (Byte>>4) /\ 0xf,
401 Low is (Byte /\ 0xf),
402 code_type(H, xdigit(High)),
403 code_type(L, xdigit(Low))
404 },
405 [H,L].
406
407
408
428url_cached(URL, Property) :-
429 url_cache_dir(Dir),
430 url_cached(Dir, URL, Property).
431
432url_cached(Dir, URL, Property) :-
433 nonvar(URL),
434 !,
435 url_cache_file(URL, Dir, url, Path),
436 atom_concat(Path, '.meta', MetaFile),
437 exists_file(MetaFile),
438 cache_file_property(Property, MetaFile).
439url_cached(Dir, URL, Property) :-
440 nonvar(Property),
441 Property = file(File),
442 atom(File),
443 atom_concat(Dir, Rest, File),
444 \+ sub_atom(Rest, _, _, _, '../'),
445 file_name_extension(Base, url, File),
446 file_name_extension(Base, meta, MetaFile),
447 exists_file(MetaFile),
448 once(read_meta_file(MetaFile, url(URL))).
449url_cached(Dir, URL, Property) :-
450 atom_concat(Dir, '/??', TopPat),
451 expand_file_name(TopPat, TopDirs),
452 member(TopDir, TopDirs),
453 atom_concat(TopDir, '/??', DirPat),
454 expand_file_name(DirPat, FileDirs),
455 member(FileDir, FileDirs),
456 atom_concat(FileDir, '/*.meta', FilePat),
457 expand_file_name(FilePat, MetaFiles),
458 member(MetaFile, MetaFiles),
459 once(read_meta_file(MetaFile, url(URL))),
460 check_cache_file(MetaFile, URL),
461 cache_file_property(Property, MetaFile).
462
463check_cache_file(MetaFile, URL) :-
464 file_name_extension(File, meta, MetaFile),
465 ( exists_file(File)
466 -> true
467 ; print_message(warning, url_cache(no_file(File, MetaFile, URL))),
468 delete_file(MetaFile),
469 fail
470 ).
471
472cache_file_property(Property, MetaFile) :-
473 var(Property),
474 !,
475 cache_file_property_ndet(Property, MetaFile).
476cache_file_property(Property, MetaFile) :-
477 cache_file_property_ndet(Property, MetaFile),
478 !.
479
480
481cache_file_property_ndet(file(File), MetaFile) :-
482 file_name_extension(File, meta, MetaFile).
483cache_file_property_ndet(P, MetaFile) :-
484 read_meta_file(MetaFile, P),
485 P \= url(_).
486
487 490
491:- multifile
492 prolog:message//1. 493
494prolog:message(url_cache(no_file(File, _MetaFile, URL))) -->
495 [ 'URL Cache: file ~q does not exist (URL=~q)'-[File, URL] ]
Cache the content of external URLs in local files
This library provides a cache for data stored in extenal URLs. The content of each URL is kept in a file and described by a meta-file that remembers the mime-type, the original URL, when it was fetched and -if provided by the server- the last-modified stamp.