1 /* ------------------------------------------------------------ */
2 /*
3 HTTrack Website Copier, Offline Browser for Windows and Unix
4 Copyright (C) 1998-2017 Xavier Roche and other contributors
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
18
19 Important notes:
20
21 - We hereby ask people using this source NOT to use it in purpose of grabbing
22 emails addresses, or collecting any other private information on persons.
23 This would disgrace our work, and spoil the many hours we spent on it.
24
25 Please visit our Website: http://www.httrack.com
26 */
27
28 /* ------------------------------------------------------------ */
29 /* File: httrack.c subroutines: */
30 /* savename routine (compute output filename) */
31 /* Author: Xavier Roche */
32 /* ------------------------------------------------------------ */
33
34 /* Internal engine bytecode */
35 #define HTS_INTERNAL_BYTECODE
36
37 #include "htscore.h"
38 #include "htsname.h"
39 #include "md5.h"
40 #include "htsmd5.h"
41 #include "htstools.h"
42 #include "htscharset.h"
43 #include "htsencoding.h"
44 #include <ctype.h>
45
/* Add the path part of the URL to the save name being built.
   Copies, through strncatbuff(), (nom_pos - start_pos) characters starting at
   start_pos into a local scratch buffer, then hands that buffer to
   url_savename_addstr() together with afs->save.
   Expands to a braced block; afs, start_pos and nom_pos must be in scope at
   the expansion site. */
#define ADD_STANDARD_PATH \
    {   /* append path */\
      char BIGSTK buff[HTS_URLMAXSIZE*2];\
      buff[0]='\0';\
      strncatbuff(buff,start_pos,nom_pos - start_pos);\
      url_savename_addstr(afs->save, buff);\
    }
53
/* Add the file name part of the URL to the save name being built.
   standard_name() fills a local scratch buffer from dot_pos, nom_pos and
   fil_complete, with `shortname` forwarded as its last argument; the buffer is
   then handed to url_savename_addstr() together with afs->save.
   Expands to a braced block; afs, dot_pos, nom_pos and fil_complete must be in
   scope at the expansion site. */
#define ADD_STANDARD_NAME(shortname) \
    {   /* append name */\
      char BIGSTK buff[HTS_URLMAXSIZE*2];\
      standard_name(buff,dot_pos,nom_pos,fil_complete,(shortname));\
      url_savename_addstr(afs->save, buff);\
    }
60
/* Avoid stupid DOS system folders/file such as 'nul' */
/* Based on linux/fs/umsdos/mangle.c */
/* Every device name is stored with a leading '/'. The last entry is an empty
   string -- presumably the end-of-list marker for whoever walks this table
   (the consumer is not visible here; confirm before relying on it). */
static const char *hts_tbdev[] = {
  "/prn", "/con", "/aux", "/nul",
  "/lpt1", "/lpt2", "/lpt3", "/lpt4",
  "/com1", "/com2", "/com3", "/com4",
  "/clock$",
  "/emmxxxx0", "/xmsxxxx0", "/setverxx",
  ""
};
71
/* Spin until back_pluggable_sockets_strict(sback, opt) returns a value > 0.
   While waiting, opt->state._hts_in_html_parsing is set to 6, back_wait() and
   engine_stats() are called, the HTS_STAT counters are refreshed and the
   "loop" callback is run on every iteration.
   If the callback returns zero the macro executes `return -1` in the
   ENCLOSING function; on that path the saved parsing state is NOT restored.
   Otherwise the previous value of _hts_in_html_parsing is put back on exit.
   Requires opt, sback, cache and ptr to be in scope at the expansion site. */
#define URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET() do { \
    int prev = opt->state._hts_in_html_parsing; \
    while(back_pluggable_sockets_strict(sback, opt) <= 0) { \
      opt->state. _hts_in_html_parsing = 6; \
      /* Wait on the backing layer */ \
      back_wait(sback,opt,cache,0); \
      /* Transfer rate */ \
      engine_stats(); \
      /* Refresh various stats */ \
      HTS_STAT.stat_nsocket=back_nsoc(sback); \
      HTS_STAT.stat_errors=fspc(opt,NULL,"error"); \
      HTS_STAT.stat_warnings=fspc(opt,NULL,"warning"); \
      HTS_STAT.stat_infos=fspc(opt,NULL,"info"); \
      HTS_STAT.nbk=backlinks_done(sback,opt->liens,opt->lien_tot,ptr); \
      HTS_STAT.nb=back_transferred(HTS_STAT.stat_bytes,sback); \
      /* Run the loop callback; a zero result aborts the enclosing function */ \
      { \
        if (!RUN_CALLBACK7(opt, loop, sback->lnk, sback->count,-1,ptr,opt->lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) { \
          return -1; \
        } \
      } \
    } \
    opt->state._hts_in_html_parsing = prev; \
  } while(0)
96
97 /* Strip all // */
/* Collapse every run of consecutive '/' in s down to a single '/', in place.
   The string only ever shrinks; nothing is written when s has no such run. */
static void cleanDoubleSlash(char *s) {
  const char *src;              /* read cursor over the original characters */
  char *dst = s;                /* write cursor, never ahead of src */

  for (src = s; *src != '\0'; src++) {
    /* A slash directly preceded by another slash is dropped. The byte at
       src[-1] is still the original one because dst never passes src. */
    const int repeated = (src != s) && (*src == '/') && (src[-1] == '/');

    if (!repeated) {
      if (dst != src) {
        *dst = *src;
      }
      dst++;
    }
  }

  /* Move the terminator only if something was actually removed */
  if (dst != src) {
    *dst = '\0';
  }
}
115
116 /* Strip all ending . or ' ' (windows-forbidden) */
/* Remove, in place, the trailing ' ' and '.' characters of every
   '/'-separated component of s (Windows forbids names ending that way).
   Spaces and dots inside a component are kept. A component made only of
   spaces/dots becomes empty, so "a/../b" turns into "a//b". */
static void cleanEndingSpaceOrDot(char *s) {
  const char *src = s;          /* read cursor */
  char *dst = s;                /* write cursor, never ahead of src */
  char *committed = s;          /* end of the output known to be kept */
  char ch;

  /* The terminating '\0' is processed like any other character so that the
     last component gets trimmed and the string gets re-terminated. */
  do {
    ch = *src;

    /* Component boundary: discard the uncommitted tail of spaces/dots */
    if (ch == '\0' || ch == '/') {
      dst = committed;
    }

    if (dst != src) {
      *dst = ch;
    }
    dst++;

    /* Anything but ' ' or '.' validates all that has been written so far */
    if (!(ch == ' ' || ch == '.')) {
      committed = dst;
    }

    src++;
  } while (ch != '\0');
}
139
140 // forme le nom du fichier à sauver (save) à partir de fil et adr
141 // système intelligent, qui renomme en cas de besoin (exemple: deux INDEX.HTML et index.html)
url_savename(lien_adrfilsave * const afs,lien_adrfil * const former,const char * referer_adr,const char * referer_fil,httrackp * opt,struct_back * sback,cache_back * cache,hash_struct * hash,int ptr,int numero_passe,const lien_back * headers)142 int url_savename(lien_adrfilsave *const afs,
143 lien_adrfil *const former,
144 const char *referer_adr, const char *referer_fil,
145 httrackp * opt, struct_back * sback, cache_back * cache,
146 hash_struct * hash, int ptr, int numero_passe,
147 const lien_back * headers) {
148 char catbuff[CATBUFF_SIZE];
149 const int is_redirect = headers != NULL && HTTP_IS_REDIRECT(headers->r.statuscode);
150 const char *mime_type = headers != NULL && !is_redirect ? headers->r.contenttype : NULL;
151 /*const char* mime_type = ( headers && HTTP_IS_OK(headers->r.statuscode) ) ? headers->r.contenttype : NULL; */
152 lien_back *const back = sback->lnk;
153
154 /* */
155 char BIGSTK fil[HTS_URLMAXSIZE * 2]; /* ="" */
156
157 const char *const adr_complete = afs->af.adr;
158 const char *const fil_complete = afs->af.fil;
159
160 /*char BIGSTK normadr_[HTS_URLMAXSIZE*2]; */
161 char BIGSTK normadr_[HTS_URLMAXSIZE * 2], normfil_[HTS_URLMAXSIZE * 2];
162 enum { PROTOCOL_HTTP, PROTOCOL_HTTPS, PROTOCOL_FTP, PROTOCOL_FILE,
163 PROTOCOL_UNKNOWN };
164 static const char *protocol_str[] =
165 { "http", "https", "ftp", "file", "unknown" };
166 int protocol = PROTOCOL_HTTP;
167 const char *const adr = jump_identification_const(adr_complete);
168 // copy of fil, used for lookups (see urlhack)
169 const char *normadr = adr;
170 const char *normfil = fil_complete;
171 const char *const print_adr = jump_protocol_const(adr);
172 const char *start_pos = NULL, *nom_pos = NULL, *dot_pos = NULL; // Position nom et point
173
174 // pour changement d'extension ou de nom (content-disposition)
175 int ext_chg = 0, ext_chg_delayed = 0;
176 int is_html = 0;
177 char ext[256];
178 int max_char = 0;
179
180 //CLEAR
181 fil[0] = ext[0] = '\0';
182 afs->save[0] = '\0';
183
184 /* 8-3 ? */
185 switch (opt->savename_83) {
186 case 1: // 8-3
187 max_char = 8;
188 break;
189 case 2: // Level 2 File names may be up to 31 characters.
190 max_char = 31;
191 break;
192 default:
193 max_char = 8;
194 break;
195 }
196
197 // normalize the URL:
198 // www.foo.com -> foo.com
199 // www-42.foo.com -> foo.com
200 // foo.com/bar//foobar -> foo.com/bar/foobar
201 if (opt->urlhack) {
202 // copy of adr (without protocol), used for lookups (see urlhack)
203 normadr = adr_normalized(adr, normadr_);
204 normfil = fil_normalized(fil_complete, normfil_);
205 } else {
206 if (link_has_authority(adr_complete)) { // https or other protocols : in "http/" subfolder
207 char *pos = strchr(adr_complete, ':');
208
209 if (pos != NULL) {
210 normadr_[0] = '\0';
211 strncatbuff(normadr_, adr_complete, (int) (pos - adr_complete));
212 strcatbuff(normadr_, "://");
213 strcatbuff(normadr_, normadr);
214 normadr = normadr_;
215 }
216 }
217 }
218
219 // à afficher sans ftp://
220 if (strfield(adr_complete, "https:")) {
221 protocol = PROTOCOL_HTTPS;
222 } else if (strfield(adr_complete, "ftp:")) {
223 protocol = PROTOCOL_FTP;
224 } else if (strfield(adr_complete, "file:")) {
225 protocol = PROTOCOL_FILE;
226 } else {
227 protocol = PROTOCOL_HTTP;
228 }
229
230 // court-circuit pour lien primaire
231 if (strnotempty(adr) == 0) {
232 if (strcmp(fil_complete, "primary") == 0) {
233 strcatbuff(afs->save, "primary.html");
234 return 0;
235 }
236 }
237
238 /* Declare adr (IDNA-decoded if necessary) */
239 #define DECLARE_ADR(FINAL_ADR) \
240 char *idna_adr =\
241 /* http or https */\
242 (\
243 protocol == PROTOCOL_HTTP\
244 || protocol == PROTOCOL_HTTPS \
245 )\
246 /* and contains IDNA */\
247 && hts_isStringIDNA(adr_complete, strlen(print_adr))\
248 ? hts_convertStringIDNAToUTF8(print_adr, strlen(print_adr))\
249 : NULL;\
250 const char *const FINAL_ADR = idna_adr != NULL \
251 ? idna_adr : ( protocol == PROTOCOL_FILE ? "file" : print_adr )
252
253 /* Release adr */
254 #define RELEASE_ADR() do {\
255 if (idna_adr != NULL) {\
256 free(idna_adr);\
257 idna_adr = NULL;\
258 }\
259 } while(0)
260
261 // vérifier que le nom n'a pas déja été calculé (si oui le renvoyer tel que)
262 // vérifier que le nom n'est pas déja pris...
263 // NOTE: si on cherche /toto/ et que /toto est trouvé on le prend (et réciproquqment) ** // **
264 if (opt->liens != NULL) {
265 int i;
266
267 i = hash_read(hash, normadr, normfil, HASH_STRUCT_ADR_PATH); // recherche table 1 (adr+fil)
268 if (i >= 0) { // ok, trouvé
269 strcpybuff(afs->save, heap(i)->sav);
270 return 0;
271 }
272 i = hash_read(hash, normadr, normfil, HASH_STRUCT_ORIGINAL_ADR_PATH); // recherche table 2 (former->adr+former->fil)
273 if (i >= 0) { // ok, trouvé
274 // copier location moved!
275 strcpybuff(afs->af.adr, heap(i)->adr);
276 strcpybuff(afs->af.fil, heap(i)->fil);
277 // et save
278 strcpybuff(afs->save, heap(i)->sav); // copier (formé à partir du nouveau lien!)
279 return 0;
280 }
281 // chercher sans / ou avec / dans former
282 {
283 char BIGSTK fil_complete_patche[HTS_URLMAXSIZE * 2];
284
285 strcpybuff(fil_complete_patche, normfil);
286 // Version avec ou sans /
287 if (fil_complete_patche[strlen(fil_complete_patche) - 1] == '/')
288 fil_complete_patche[strlen(fil_complete_patche) - 1] = '\0';
289 else
290 strcatbuff(fil_complete_patche, "/");
291 i = hash_read(hash, normadr, fil_complete_patche, HASH_STRUCT_ORIGINAL_ADR_PATH); // recherche table 2 (former->adr+former->fil)
292 if (i >= 0) {
293 // écraser fil et adr (pas former->fil?????)
294 strcpybuff(afs->af.adr, heap(i)->adr);
295 strcpybuff(afs->af.fil, heap(i)->fil);
296 // écrire save
297 strcpybuff(afs->save, heap(i)->sav);
298 return 0;
299 }
300 }
301 }
302 // vérifier la non présence de paramètres dans le nom de fichier
303 // si il y en a, les supprimer (ex: truc.cgi?subj=aspirateur)
304 // néanmoins, gardé pour vérifier la non duplication (voir après)
305 {
306 char *a;
307
308 a = strchr(fil_complete, '?');
309 if (a != NULL) {
310 strncatbuff(fil, fil_complete, a - fil_complete);
311 } else {
312 strcpybuff(fil, fil_complete);
313 }
314 }
315
316 // decode remaining % (normally not necessary; already done in htsparse.c)
317 // this will NOT decode buggy %xx (ie. not UTF-8) ones
318 if (hts_unescapeUrl(fil, catbuff, sizeof(catbuff)) == 0) {
319 strcpybuff(fil, catbuff);
320 } else {
321 hts_log_print(opt, LOG_WARNING,
322 "could not URL-decode string '%s'", fil);
323 }
324
325 /* replace shtml to html.. */
326 if (opt->savename_delayed == 2)
327 is_html = -1; /* ALWAYS delay type */
328 else
329 is_html = ishtml(opt, fil);
330 switch (is_html) { /* .html,.shtml,.. */
331 case 1:
332 if ((strfield2(get_ext(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), fil), "html") == 0)
333 && (strfield2(get_ext(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), fil), "htm") == 0)
334 ) {
335 strcpybuff(ext, "html");
336 ext_chg = 1;
337 }
338 break;
339 case 0:
340 if (!strnotempty(ext)) {
341 if (is_userknowntype(opt, fil)) { // mime known by user
342 char BIGSTK mime[1024];
343
344 mime[0] = ext[0] = '\0';
345 get_userhttptype(opt, mime, fil);
346 if (strnotempty(mime)) {
347 give_mimext(ext, mime);
348 if (strnotempty(ext)) {
349 ext_chg = 1;
350 }
351 }
352 }
353 }
354 break;
355 }
356
357 // si option check_type activée
358 if (is_html < 0 && opt->check_type && !ext_chg) {
359 int ishtest = 0;
360
361 if (protocol != PROTOCOL_FILE
362 && protocol != PROTOCOL_FTP
363 ) {
364 // tester type avec requète HEAD si on ne connait pas le type du fichier
365 if (!((opt->check_type == 1) && (fil[strlen(fil) - 1] == '/'))) // slash doit être html?
366 if (opt->savename_delayed == 2 || (ishtest = ishtml(opt, fil)) < 0) { // on ne sait pas si c'est un html ou un fichier..
367 // lire dans le cache
368 htsblk r = cache_read_including_broken(opt, cache, adr, fil); // test uniquement
369
370 if (r.statuscode != -1) { // pas d'erreur de lecture cache
371 char s[32];
372
373 s[0] = '\0';
374 hts_log_print(opt, LOG_DEBUG, "Testing link type (from cache) %s%s",
375 adr_complete, fil_complete);
376 if (!HTTP_IS_REDIRECT(r.statuscode)) {
377 if (strnotempty(r.cdispo)) { /* filename given */
378 ext_chg = 2; /* change filename */
379 strcpybuff(ext, r.cdispo);
380 } else if (!may_unknown2(opt, r.contenttype, fil)) { // on peut patcher à priori?
381 give_mimext(s, r.contenttype); // obtenir extension
382 if (strnotempty(s) > 0) { // on a reconnu l'extension
383 ext_chg = 1;
384 strcpybuff(ext, s);
385 }
386 }
387 }
388 #ifdef DEFAULT_BIN_EXT
389 // no extension and potentially bogus
390 else if (ishtest == -2) {
391 ext_chg = 1;
392 strcpybuff(ext, DEFAULT_BIN_EXT + 1);
393 }
394 #endif
395 //
396 } else if (opt->savename_delayed != 2 && is_userknowntype(opt, fil)) { /* PATCH BY BRIAN SCHRÖDER.
397 Lookup mimetype not only by extension,
398 but also by filename */
399 /* Note: "foo.cgi => text/html" means that foo.cgi shall have the text/html MIME file type,
400 that is, ".html" */
401 char BIGSTK mime[1024];
402
403 mime[0] = ext[0] = '\0';
404 get_userhttptype(opt, mime, fil);
405 if (strnotempty(mime)) {
406 give_mimext(ext, mime);
407 if (strnotempty(ext)) {
408 ext_chg = 1;
409 }
410 }
411 }
412 // note: if savename_delayed is enabled, the naming will be temporary (and slightly invalid!)
413 // note: if we are about to stop (opt->state.stop), back_add() will fail later
414 else if (opt->savename_delayed != 0 && !opt->state.stop) {
415 // Check if the file is ready in backing. We basically take the same logic as later.
416 // FIXME: we should cleanup and factorize this unholy mess
417 if (headers != NULL && headers->status >= 0 && !is_redirect) {
418 if (strnotempty(headers->r.cdispo)) { /* filename given */
419 ext_chg = 2; /* change filename */
420 strcpybuff(ext, headers->r.cdispo);
421 } else if (!may_unknown2(opt, headers->r.contenttype, headers->url_fil)) { // on peut patcher à priori? (pas interdit ou pas de type)
422 char s[16];
423 s[0] = '\0';
424 give_mimext(s, headers->r.contenttype); // obtenir extension
425 if (strnotempty(s) > 0) { // on a reconnu l'extension
426 ext_chg = 1;
427 strcpybuff(ext, s);
428 }
429 }
430 }
431 else if (mime_type != NULL) {
432 ext[0] = '\0';
433 if (*mime_type) {
434 give_mimext(ext, mime_type);
435 }
436 if (strnotempty(ext)) {
437 char mime_from_file[128];
438
439 mime_from_file[0] = 0;
440 get_httptype(opt, mime_from_file, fil, 1);
441 if (!strnotempty(mime_from_file) || strcasecmp(mime_type, mime_from_file) != 0) { /* different mime for this type */
442 /* type change not forbidden (or no extension at all) */
443 if (!may_unknown2(opt, mime_type, fil)) {
444 ext_chg = 1;
445 }
446 #ifdef DEFAULT_BIN_EXT
447 // no extension and potentially bogus
448 else if (ishtml(opt, fil) == -2) {
449 ext_chg = 1;
450 strcpybuff(ext, DEFAULT_BIN_EXT + 1);
451 }
452 #endif
453 } else {
454 ext_chg = 0;
455 }
456 }
457 } else {
458 /* Avoid collisions (no collisionning detection) */
459 sprintf(ext, "%x.%s", opt->state.delayedId++, DELAYED_EXT);
460 ext_chg = 1;
461 ext_chg_delayed = 1; /* due to naming system */
462 }
463 }
464 // test imposible dans le cache, faire une requête
465 else {
466 //
467 int hihp = opt->state._hts_in_html_parsing;
468 int has_been_moved = 0;
469 lien_adrfil current;
470
471 /* Ensure we don't use too many sockets by using a "testing" one
472 If we have only 1 simultaneous connection authorized, wait for pending download
473 Wait for an available slot
474 */
475 URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET();
476
477 /* Rock'in */
478 current.adr[0] = current.fil[0] = '\0';
479 opt->state._hts_in_html_parsing = 2; // test
480 hts_log_print(opt, LOG_DEBUG, "Testing link type %s%s",
481 adr_complete, fil_complete);
482 strcpybuff(current.adr, adr_complete);
483 strcpybuff(current.fil, fil_complete);
484 // ajouter dans le backing le fichier en mode test
485 // savename: rien car en mode test
486 if (back_add
487 (sback, opt, cache, current.adr, current.fil, BACK_ADD_TEST,
488 referer_adr, referer_fil, 1) != -1) {
489 int b;
490
491 b = back_index(opt, sback, current.adr, current.fil, BACK_ADD_TEST);
492 if (b >= 0) {
493 int stop_looping = 0;
494 int petits_tours = 0;
495 int get_test_request = 0; // en cas de bouclage sur soi même avec HEAD, tester avec GET.. parfois c'est la cause des problèmes
496
497 do {
498 // temps à attendre, et remplir autant que l'on peut le cache (backing)
499 if (back[b].status > 0) {
500 back_wait(sback, opt, cache, 0);
501 }
502 if (ptr >= 0) {
503 back_fillmax(sback, opt, cache, ptr, numero_passe);
504 }
505 // on est obligé d'appeler le shell pour le refresh..
506 // Transfer rate
507 engine_stats();
508
509 // Refresh various stats
510 HTS_STAT.stat_nsocket = back_nsoc(sback);
511 HTS_STAT.stat_errors = fspc(opt, NULL, "error");
512 HTS_STAT.stat_warnings = fspc(opt, NULL, "warning");
513 HTS_STAT.stat_infos = fspc(opt, NULL, "info");
514 HTS_STAT.nbk = backlinks_done(sback, opt->liens, opt->lien_tot, ptr);
515 HTS_STAT.nb = back_transferred(HTS_STAT.stat_bytes, sback);
516
517 if (!RUN_CALLBACK7
518 (opt, loop, sback->lnk, sback->count, b, ptr, opt->lien_tot,
519 (int) (time_local() - HTS_STAT.stat_timestart),
520 &HTS_STAT)) {
521 return -1;
522 } else if (opt->state._hts_cancel || !back_checkmirror(opt)) { // cancel 2 ou 1 (cancel parsing)
523 back_delete(opt, cache, sback, b); // cancel test
524 stop_looping = 1;
525 }
526 // traitement des 304,303..
527 if (back[b].status <= 0) {
528 if (HTTP_IS_REDIRECT(back[b].r.statuscode)) { // agh moved.. un tit tour de plus
529 if ((petits_tours < 5) && former != NULL) { // on va pas tourner en rond non plus!
530 if (strnotempty(back[b].r.location)) { // location existe!
531 char BIGSTK mov_url[HTS_URLMAXSIZE * 2];
532 lien_adrfil moved;
533 mov_url[0] = moved.adr[0] = moved.fil[0] = '\0';
534 //
535 strcpybuff(mov_url, back[b].r.location); // copier URL
536 if (ident_url_relatif
537 (mov_url, current.adr, current.fil, &moved) >= 0) {
538 // si non bouclage sur soi même, ou si test avec GET non testé
539 if ((strcmp(moved.adr, current.adr))
540 || (strcmp(moved.fil, current.fil))
541 || (get_test_request == 0)) {
542 // bouclage?
543 if ((!strcmp(moved.adr, current.adr))
544 && (!strcmp(moved.fil, current.fil)))
545 get_test_request = 1; // faire requète avec GET
546
547 // recopier former->adr/fil?
548 if (former != NULL) {
549 if (strnotempty(former->adr) == 0) { // Pas déja noté
550 strcpybuff(former->adr, current.adr);
551 strcpybuff(former->fil, current.fil);
552 }
553 }
554 // check explicit forbidden - don't follow 3xx in this case
555 {
556 int set_prio_to = 0;
557
558 if (hts_acceptlink(opt, ptr, moved.adr, moved.fil, NULL, NULL, &set_prio_to, NULL) == 1) { /* forbidden */
559 has_been_moved = 1;
560 back_maydelete(opt, cache, sback, b); // ok
561 strcpybuff(current.adr, moved.adr);
562 strcpybuff(current.fil, moved.fil);
563 mov_url[0] = '\0';
564 stop_looping = 1;
565 }
566 }
567
568 // ftp: stop!
569 if (strfield(mov_url, "ftp://")
570 ) { // ftp, ok on arrête
571 has_been_moved = 1;
572 back_maydelete(opt, cache, sback, b); // ok
573 strcpybuff(current.adr, moved.adr);
574 strcpybuff(current.fil, moved.fil);
575 stop_looping = 1;
576 } else if (*mov_url) {
577 const char *methode;
578
579 if (!get_test_request)
580 methode = BACK_ADD_TEST; // tester avec HEAD
581 else {
582 methode = BACK_ADD_TEST2; // tester avec GET
583 hts_log_print(opt, LOG_WARNING,
584 "Loop with HEAD request (during prefetch) at %s%s",
585 current.adr, current.fil);
586 }
587 // Ajouter
588 URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET();
589 if (back_add(sback, opt, cache, moved.adr, moved.fil, methode, referer_adr, referer_fil, 1) != -1) { // OK
590 hts_log_print(opt, LOG_DEBUG,
591 "(during prefetch) %s (%d) to link %s at %s%s",
592 back[b].r.msg,
593 back[b].r.statuscode,
594 back[b].r.location, current.adr,
595 current.fil);
596
597 // libérer emplacement backing actuel et attendre le prochain
598 back_maydelete(opt, cache, sback, b);
599 strcpybuff(current.adr, moved.adr);
600 strcpybuff(current.fil, moved.fil);
601 b =
602 back_index(opt, sback, current.adr, current.fil,
603 methode);
604 if (!get_test_request)
605 has_been_moved = 1; // sinon ne pas forcer has_been_moved car non déplacé
606 petits_tours++;
607 //
608 } else { // sinon on fait rien et on s'en va.. (ftp etc)
609 hts_log_print(opt, LOG_DEBUG,
610 "Warning: Savename redirect backing error at %s%s",
611 moved.adr, moved.fil);
612 }
613 }
614 } else {
615 hts_log_print(opt, LOG_WARNING,
616 "Unable to test %s%s (loop to same filename)",
617 adr_complete, fil_complete);
618 }
619
620 }
621 }
622 } else { // arrêter les frais
623 hts_log_print(opt, LOG_WARNING,
624 "Unable to test %s%s (loop)",
625 adr_complete, fil_complete);
626 }
627 } // ok, leaving
628 }
629 } while(!stop_looping && back[b].status > 0
630 && back[b].status < 1000);
631
632 // Si non déplacé, forcer type?
633 if (!has_been_moved) {
634 if (back[b].r.statuscode != -10) { // erreur
635 if (strnotempty(back[b].r.contenttype) == 0)
636 strcpybuff(back[b].r.contenttype, "text/html"); // message d'erreur en html
637 // Finalement on, renvoie un erreur, pour ne toucher à rien dans le code
638 // libérer emplacement backing
639 }
640
641 { // pas d'erreur, changer type?
642 char s[16];
643
644 s[0] = '\0';
645 if (strnotempty(back[b].r.cdispo)) { /* filename given */
646 ext_chg = 2; /* change filename */
647 strcpybuff(ext, back[b].r.cdispo);
648 } else if (!may_unknown2(opt, back[b].r.contenttype, back[b].url_fil)) { // on peut patcher à priori? (pas interdit ou pas de type)
649 give_mimext(s, back[b].r.contenttype); // obtenir extension
650 if (strnotempty(s) > 0) { // on a reconnu l'extension
651 ext_chg = 1;
652 strcpybuff(ext, s);
653 }
654 }
655 #ifdef DEFAULT_BIN_EXT
656 // no extension and potentially bogus
657 else if (ishtest == -2) {
658 ext_chg = 1;
659 strcpybuff(ext, DEFAULT_BIN_EXT + 1);
660 }
661 #endif
662 }
663 }
664 // FIN Si non déplacé, forcer type?
665
666 // libérer emplacement backing
667 back_maydelete(opt, cache, sback, b);
668
669 // --- --- ---
670 // oops, a été déplacé.. on recalcule en récursif (osons!)
671 if (has_been_moved) {
672 // copier adr, fil (optionnel, mais sinon marche pas pour le rip)
673 strcpybuff(afs->af.adr, current.adr);
674 strcpybuff(afs->af.fil, current.fil);
675 // copier adr, fil
676
677 return url_savename(afs, NULL,
678 referer_adr, referer_fil, opt,
679 sback, cache, hash, ptr,
680 numero_passe, NULL);
681 }
682 // --- --- ---
683
684 }
685
686 } else {
687 printf
688 ("PANIC! : Savename Crash adding error, unexpected error found.. [%d]\n",
689 __LINE__);
690 #if BDEBUG==1
691 printf("error while savename crash adding\n");
692 #endif
693 hts_log_print(opt, LOG_ERROR,
694 "Unexpected savename backing error at %s%s", adr,
695 fil_complete);
696
697 }
698 // restaurer
699 opt->state._hts_in_html_parsing = hihp;
700 } // caché?
701 }
702 }
703 }
704
705 // - - - DEBUT NOMMAGE - - -
706
707 // Donner nom par défaut?
708 if (fil[strlen(fil) - 1] == '/') {
709 if (!strfield(adr_complete, "ftp://")
710 ) {
711 strcatbuff(fil, DEFAULT_HTML); // nommer page par défaut!!
712 } else {
713 if (!opt->proxy.active)
714 strcatbuff(fil, DEFAULT_FTP); // nommer page par défaut (texte)
715 else
716 strcatbuff(fil, DEFAULT_HTML); // nommer page par défaut (à priori ici html depuis un proxy http)
717 }
718 }
719 // Changer extension?
720 // par exemple, php3 sera sauvé en html, cgi en html ou gif, xbm etc.. selon les cas
721 if (ext_chg && !opt->no_type_change) { // changer ext
722 char *a = fil + strlen(fil) - 1;
723
724 if ((opt->debug > 1) && (opt->log != NULL)) {
725 if (ext_chg == 1)
726 hts_log_print(opt, LOG_DEBUG, "Changing link extension %s%s to .%s",
727 adr_complete, fil_complete, ext);
728 else
729 hts_log_print(opt, LOG_DEBUG, "Changing link name %s%s to %s",
730 adr_complete, fil_complete, ext);
731 }
732 if (ext_chg == 1) {
733 while((a > fil) && (*a != '.') && (*a != '/'))
734 a--;
735 if (*a == '.')
736 *a = '\0'; // couper
737 strcatbuff(fil, "."); // recopier point
738 } else {
739 while((a > fil) && (*a != '/'))
740 a--;
741 if (*a == '/')
742 a++;
743 *a = '\0';
744 }
745 strcatbuff(fil, ext); // copier ext/nom
746 }
747 // Rechercher premier / et dernier .
748 {
749 const char *a = fil + strlen(fil) - 1;
750
751 // passer structures
752 start_pos = fil;
753 while((a > fil) && (*a != '/') && (*a != '\\')) {
754 if (*a == '.') // point? noter position
755 if (!dot_pos)
756 dot_pos = a;
757 a--;
758 }
759 if ((*a == '/') || (*a == '\\'))
760 a++;
761 nom_pos = a;
762 }
763
764 // un nom de fichier est généré
765 // s'il existe déja, alors on le mofifie légèrement
766
767 // ajouter nom du site éventuellement en premier
768 if (opt->savename_type == -1) { // utiliser savename_userdef! (%h%p/%n%q.%t)
769 const char *a = StringBuff(opt->savename_userdef);
770 char *b = afs->save;
771
772 /*char *nom_pos=NULL,*dot_pos=NULL; // Position nom et point */
773 char tok;
774
775 /*
776 { // Rechercher premier /
777 char* a=fil+strlen(fil)-1;
778 // passer structures
779 while(((int) a>(int) fil) && (*a != '/') && (*a != '\\')) {
780 if (*a == '.') // point? noter position
781 if (!dot_pos)
782 dot_pos=a;
783 a--;
784 }
785 if ((*a=='/') || (*a=='\\')) a++;
786 nom_pos = a;
787 }
788 */
789
790 // Construire nom
791 while((*a) && (((int) (b - afs->save)) < HTS_URLMAXSIZE)) { // parser, et pas trop long..
792 if (*a == '%') {
793 int short_ver = 0;
794
795 a++;
796 if (*a == 's') {
797 short_ver = 1;
798 a++;
799 }
800 *b = '\0';
801 switch (tok = *a++) {
802 case '[': // %[param:prefix_if_not_empty:suffix_if_not_empty:empty_replacement:notfound_replacement]
803 if (strchr(a, ']')) {
804 int pos = 0;
805 char name[5][256];
806 char *c = name[0];
807
808 for(pos = 0; pos < 5; pos++) {
809 name[pos][0] = '\0';
810 }
811 pos = 0;
812 while(*a != '\0' && *a != ']') {
813 if (pos < 5) {
814 if (*a == ':') { // next token
815 c = name[++pos];
816 a++;
817 } else {
818 *c++ = *a++;
819 *c = '\0';
820 }
821 }
822 }
823 if (*a == ']') {
824 a++;
825 }
826 strcatbuff(name[0], "="); /* param=.. */
827 c = strchr(fil_complete, '?');
828 /* parameters exists */
829 if (c) {
830 char *cp;
831
832 while((cp = strstr(c + 1, name[0])) && *(cp - 1) != '?' && *(cp - 1) != '&') { /* finds [?&]param= */
833 c = cp;
834 }
835 if (cp) {
836 c = cp + strlen(name[0]); /* jumps "param=" */
837 strcpybuff(b, name[1]); /* prefix */
838 b += strlen(b);
839 if (*c != '\0' && *c != '&') {
840 char *d = name[0];
841
842 /* */
843 while(*c != '\0' && *c != '&') {
844 *d++ = *c++;
845 }
846 *d = '\0';
847 d = unescape_http(catbuff, sizeof(catbuff), name[0]);
848 if (d && *d) {
849 strcpybuff(b, d); /* value */
850 b += strlen(b);
851 } else {
852 strcpybuff(b, name[3]); /* empty replacement if any */
853 b += strlen(b);
854 }
855 } else {
856 strcpybuff(b, name[3]); /* empty replacement if any */
857 b += strlen(b);
858 }
859 strcpybuff(b, name[2]); /* suffix */
860 b += strlen(b);
861 } else {
862 strcpybuff(b, name[4]); /* not found replacement if any */
863 b += strlen(b);
864 }
865 } else {
866 strcpybuff(b, name[4]); /* not found replacement if any */
867 b += strlen(b);
868 }
869 }
870 break;
871 case '%':
872 *b++ = '%';
873 break;
874 case 'n': // nom sans ext
875 *b = '\0';
876 if (dot_pos) {
877 if (!short_ver) // Noms longs
878 strncatbuff(b, nom_pos, (int) (dot_pos - nom_pos));
879 else
880 strncatbuff(b, nom_pos, min((int) (dot_pos - nom_pos), 8));
881 } else {
882 if (!short_ver) // Noms longs
883 strcpybuff(b, nom_pos);
884 else
885 strncatbuff(b, nom_pos, 8);
886 }
887 b += strlen(b); // pointer à la fin
888 break;
889 case 'N': // nom avec ext
890 // RECOPIE NOM + EXT
891 *b = '\0';
892 if (dot_pos) {
893 if (!short_ver) // Noms longs
894 strncatbuff(b, nom_pos, (int) (dot_pos - nom_pos));
895 else
896 strncatbuff(b, nom_pos, min((int) (dot_pos - nom_pos), 8));
897 } else {
898 if (!short_ver) // Noms longs
899 strcpybuff(b, nom_pos);
900 else
901 strncatbuff(b, nom_pos, 8);
902 }
903 b += strlen(b); // pointer à la fin
904 // RECOPIE NOM + EXT
905 *b = '\0';
906 if (dot_pos) {
907 if (!short_ver) // Noms longs
908 strcpybuff(b, dot_pos + 1);
909 else
910 strncatbuff(b, dot_pos + 1, 3);
911 } else {
912 if (!short_ver) // Noms longs
913 strcpybuff(b, DEFAULT_EXT + 1); // pas de..
914 else
915 strcpybuff(b, DEFAULT_EXT_SHORT + 1); // pas de..
916 }
917 b += strlen(b); // pointer à la fin
918 //
919 break;
920 case 't': // ext
921 *b = '\0';
922 if (dot_pos) {
923 if (!short_ver) // Noms longs
924 strcpybuff(b, dot_pos + 1);
925 else
926 strncatbuff(b, dot_pos + 1, 3);
927 } else {
928 if (!short_ver) // Noms longs
929 strcpybuff(b, DEFAULT_EXT + 1); // pas de..
930 else
931 strcpybuff(b, DEFAULT_EXT_SHORT + 1); // pas de..
932 }
933 b += strlen(b); // pointer à la fin
934 break;
935 case 'p': // path sans dernier /
936 *b = '\0';
937 if (nom_pos != fil + 1) { // pas: /index.html (chemin nul)
938 if (!short_ver) { // Noms longs
939 strncatbuff(b, fil, (int) (nom_pos - fil) - 1);
940 } else {
941 char BIGSTK pth[HTS_URLMAXSIZE * 2], n83[HTS_URLMAXSIZE * 2];
942
943 pth[0] = n83[0] = '\0';
944 //
945 strncatbuff(pth, fil, (int) (nom_pos - fil) - 1);
946 long_to_83(opt->savename_83, n83, pth);
947 strcpybuff(b, n83);
948 }
949 }
950 b += strlen(b); // pointer à la fin
951 break;
952 case 'h': // host (IDNA decoded if suitable)
953 // IDNA / RFC 3492 (Punycode) handling for HTTP(s)
954 {
955 DECLARE_ADR(final_adr);
956
957 /* Copy address */
958 *b = '\0';
959 if (!short_ver)
960 strcpybuff(b, final_adr);
961 else
962 strcpybuff(b, final_adr);
963
964 /* release */
965 RELEASE_ADR();
966 }
967 b += strlen(b); // pointer à la fin
968 break;
969 case 'H': // host, raw (old mode)
970 *b = '\0';
971 if (protocol == PROTOCOL_FILE) {
972 if (!short_ver) // Noms longs
973 strcpybuff(b, "localhost");
974 else
975 strcpybuff(b, "local");
976 } else {
977 if (!short_ver) // Noms longs
978 strcpybuff(b, print_adr);
979 else
980 strncatbuff(b, print_adr, 8);
981 }
982 b += strlen(b); // pointer à la fin
983 break;
984 case 'M': /* host/address?query MD5 (128-bits) */
985 *b = '\0';
986 {
987 char digest[32 + 2];
988 char BIGSTK buff[HTS_URLMAXSIZE * 2];
989
990 digest[0] = buff[0] = '\0';
991 strcpybuff(buff, adr);
992 strcatbuff(buff, fil_complete);
993 domd5mem(buff, strlen(buff), digest, 1);
994 strcpybuff(b, digest);
995 }
996 b += strlen(b); // pointer à la fin
997 break;
998 case 'Q':
999 case 'q': /* query MD5 (128-bits/16-bits)
1000 GENERATED ONLY IF query string exists! */
1001 {
1002 char md5[32 + 2];
1003
1004 *b = '\0';
1005 strncatbuff(b, url_md5(md5, fil_complete), (tok == 'Q') ? 32 : 4);
1006 b += strlen(b); // pointer à la fin
1007 }
1008 break;
1009 case 'r':
1010 case 'R': // protocol
1011 *b = '\0';
1012 strcatbuff(b, protocol_str[protocol]);
1013 b += strlen(b); // pointer à la fin
1014 break;
1015
1016 /* Patch by Juan Fco Rodriguez to get the full query string */
1017 case 'k':
1018 {
1019 char *d = strchr(fil_complete, '?');
1020
1021 if (d != NULL) {
1022 strcatbuff(b, d);
1023 b += strlen(b);
1024 }
1025 }
1026 break;
1027
1028 }
1029 } else
1030 *b++ = *a++;
1031 }
1032 *b++ = '\0';
1033 //
1034 // Types prédéfinis
1035 //
1036
1037 }
1038 //
1039 // Structure originale
1040 else if (opt->savename_type % 100 == 0) {
1041 /* recopier www.. */
1042 if (opt->savename_type != 100) {
1043 if (((opt->savename_type / 1000) % 2) == 0) { // >1000 signifie "pas de www/"
1044 DECLARE_ADR(final_adr);
1045
1046 // adresse url
1047 if (!opt->savename_83) { // noms longs (et pas de .)
1048 strcatbuff(afs->save, final_adr);
1049 } else { // noms 8-3
1050 if (strlen(final_adr) > 4) {
1051 if (strfield(final_adr, "www."))
1052 hts_appendStringUTF8(afs->save, final_adr + 4, max_char);
1053 else
1054 hts_appendStringUTF8(afs->save, final_adr, max_char);
1055 } else
1056 hts_appendStringUTF8(afs->save, final_adr, max_char);
1057 }
1058
1059 /* release */
1060 RELEASE_ADR();
1061
1062 if (*fil != '/')
1063 strcatbuff(afs->save, "/");
1064 }
1065 }
1066
1067 hts_lowcase(afs->save);
1068
1069 /*
1070 // ne sert à rien car a déja été filtré normalement
1071 if ((*fil=='.') && (*(fil+1)=='/')) // ./index.html ** //
1072 url_savename_addstr(save,fil+2);
1073 else // index.html ou /index.html
1074 url_savename_addstr(save,fil);
1075 if (save[strlen(save)-1]=='/')
1076 strcatbuff(save,DEFAULT_HTML); // nommer page par défaut!!
1077 */
1078
1079 /* add name */
1080 ADD_STANDARD_PATH;
1081 ADD_STANDARD_NAME(0);
1082
1083 }
1084 //
1085 // Structure html/image
1086 else {
1087 // dossier "web" ou "www.xxx" ?
1088 if (((opt->savename_type / 1000) % 2) == 0) { // >1000 signifie "pas de www/"
1089 if ((opt->savename_type / 100) % 2) {
1090 DECLARE_ADR(final_adr);
1091
1092 if (!opt->savename_83) { // noms longs
1093 strcatbuff(afs->save, final_adr);
1094 strcatbuff(afs->save, "/");
1095 } else { // noms 8-3
1096 if (strlen(final_adr) > 4) {
1097 if (strfield(final_adr, "www."))
1098 hts_appendStringUTF8(afs->save, final_adr + 4, max_char);
1099 else
1100 hts_appendStringUTF8(afs->save, final_adr, max_char);
1101 strcatbuff(afs->save, "/");
1102 } else {
1103 hts_appendStringUTF8(afs->save, final_adr, max_char);
1104 strcatbuff(afs->save, "/");
1105 }
1106 }
1107
1108 /* release */
1109 RELEASE_ADR();
1110 } else {
1111 strcatbuff(afs->save, "web/"); // répertoire général
1112 }
1113 }
1114 // si un html à coup sûr
1115 if ((ext_chg != 0) ? (ishtml_ext(ext) == 1) : (ishtml(opt, fil) == 1)) {
1116 if (opt->savename_type % 100 == 2) { // html/
1117 strcatbuff(afs->save, "html/");
1118 }
1119 } else {
1120 if ((opt->savename_type % 100 == 1) || (opt->savename_type % 100 == 2)) { // html & images
1121 strcatbuff(afs->save, "images/");
1122 }
1123 }
1124
1125 switch (opt->savename_type % 100) {
1126 case 4:
1127 case 5:{ // séparer par types
1128 const char *a = fil + strlen(fil) - 1;
1129
1130 // passer structures
1131 while((a > fil) && (*a != '/') && (*a != '\\'))
1132 a--;
1133 if ((*a == '/') || (*a == '\\'))
1134 a++;
1135
1136 // html?
1137 if ((ext_chg != 0) ? (ishtml_ext(ext) == 1) : (ishtml(opt, fil) == 1)) {
1138 if (opt->savename_type % 100 == 5)
1139 strcatbuff(afs->save, "html/");
1140 } else {
1141 const char *a = fil + strlen(fil) - 1;
1142
1143 while((a > fil) && (*a != '/') && (*a != '.'))
1144 a--;
1145 if (*a != '.')
1146 strcatbuff(afs->save, "other");
1147 else
1148 strcatbuff(afs->save, a + 1);
1149 strcatbuff(afs->save, "/");
1150 }
1151 /*strcatbuff(save,a); */
1152 /* add name */
1153 ADD_STANDARD_NAME(0);
1154 }
1155 break;
1156 case 99:{ // 'codé' .. c'est un gadget
1157 size_t i;
1158 size_t j;
1159 const char *a;
1160 char C[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-";
1161 int L;
1162
1163 // pseudo-CRC sur fil et adr pour initialiser générateur aléatoire..
1164 unsigned int s = 0;
1165
1166 L = (int) strlen(C);
1167 for(i = 0; fil_complete[i] != '\0'; i++) {
1168 s += (unsigned int) fil_complete[i];
1169 }
1170 for(i = 0; adr_complete[i] != '\0'; i++) {
1171 s += (unsigned int) adr_complete[i];
1172 }
1173 srand(s);
1174
1175 j = strlen(afs->save);
1176 for(i = 0; i < 8; i++) {
1177 char c = C[(rand() % L)];
1178
1179 afs->save[i + j] = c;
1180 }
1181 afs->save[i + j] = '\0';
1182 // ajouter extension
1183 a = fil + strlen(fil) - 1;
1184 while((a > fil) && (*a != '/') && (*a != '.'))
1185 a--;
1186 if (*a == '.') {
1187 strcatbuff(afs->save, a); // ajouter
1188 }
1189 }
1190 break;
1191 default:{ // noms sans les noms des répertoires
1192 // ne garder que le nom, pas la structure
1193 /*
1194 char* a=fil+strlen(fil)-1;
1195 while(((int) a>(int) fil) && (*a != '/') && (*a != '\\')) a--;
1196 if ((*a=='/') || (*a=='\\')) a++;
1197 strcatbuff(save,a);
1198 */
1199
1200 /* add name */
1201 ADD_STANDARD_NAME(0);
1202 }
1203 break;
1204 }
1205
1206 hts_lowcase(afs->save);
1207
1208 if (afs->save[strlen(afs->save) - 1] == '/')
1209 strcatbuff(afs->save, DEFAULT_HTML); // nommer page par défaut!!
1210 }
1211
1212 // vérifier qu'on ne doit pas forcer l'extension
1213 // par exemple, asp sera sauvé en html, cgi en html ou gif, xbm etc.. selon les cas
1214 /*if (ext_chg) {
1215 char* a=save+strlen(save)-1;
1216 while(((int) a>(int) save) && (*a!='.') && (*a!='/')) a--;
1217 if (*a=='.') *a='\0'; // couper
1218 // recopier extension
1219 strcatbuff(save,".");
1220 strcatbuff(save,ext); // copier ext
1221 } */
1222
1223 // Not used anymore unless non-delayed types.
1224 // de même en cas de manque d'extension on en place une de manière forcée..
1225 // cela évite les /chez/toto et les /chez/toto/index.html incompatibles
1226 if (opt->savename_type != -1 && opt->savename_delayed != 2) {
1227 char *a = afs->save + strlen(afs->save) - 1;
1228
1229 while((a > afs->save) && (*a != '.') && (*a != '/'))
1230 a--;
1231 if (*a != '.') { // agh pas de point
1232 //strcatbuff(save,".none"); // a éviter
1233 strcatbuff(afs->save, ".html"); // préférable!
1234 hts_log_print(opt, LOG_DEBUG, "Default HTML type set for %s%s => %s",
1235 adr_complete, fil_complete, afs->save);
1236 }
1237 }
1238 // effacer pass au besoin pour les autentifications
1239 // (plus la peine : masqué au début)
1240 /*
1241 {
1242 char* a = jump_identification(afs->save);
1243 if (a!=afs->save) {
1244 char BIGSTK tempo[HTS_URLMAXSIZE*2];
1245 char *b;
1246 tempo[0]='\0';
1247 strcpybuff(tempo,"[");
1248 b=strchr(save,':');
1249 if (!b) b=strchr(save,'@');
1250 if (b)
1251 strncatbuff(tempo,save,(int) b-(int) a);
1252 strcatbuff(tempo,"]");
1253 strcatbuff(tempo,a);
1254 strcpybuff(save,a);
1255 }
1256 }
1257 */
1258
1259 // éviter les / au début (cause: N100)
1260 if (afs->save[0] == '/') {
1261 char BIGSTK tempo[HTS_URLMAXSIZE * 2];
1262
1263 strcpybuff(tempo, afs->save + 1);
1264 strcpybuff(afs->save, tempo);
1265 }
1266
1267 /* Cleanup reserved or forbidden characters. */
1268 {
1269 size_t i;
1270 for(i = 0 ; afs->save[i] != '\0' ; i++) {
1271 unsigned char c = (unsigned char) afs->save[i];
1272 if (c < 32 // control
1273 || c == 127 // unwise
1274 || c == '~' // unix unwise
1275 || c == '\\' // windows separator
1276 || c == ':' // windows forbidden
1277 || c == '*' // windows forbidden
1278 || c == '?' // windows forbidden
1279 || c == '\"' // windows forbidden
1280 || c == '<' // windows forbidden
1281 || c == '>' // windows forbidden
1282 || c == '|' // windows forbidden
1283 //|| c == '@' // ?
1284 ||
1285 (
1286 opt->savename_83 == 2 // CDROM
1287 &&
1288 (
1289 c == '-'
1290 || c == '='
1291 || c == '+'
1292 )
1293 )
1294 )
1295 {
1296 afs->save[i] = '_';
1297 }
1298 }
1299 }
1300
1301 // éliminer les // (comme ftp://)
1302 cleanDoubleSlash(afs->save);
1303
1304 #if HTS_OVERRIDE_DOS_FOLDERS
1305 /* Replace /foo/nul/bar by /foo/nul_/bar */
1306 {
1307 int i = 0;
1308
1309 while(hts_tbdev[i][0]) {
1310 const char *a = afs->save;
1311
1312 while((a = strstrcase(a, hts_tbdev[i]))) {
1313 switch ((int) a[strlen(hts_tbdev[i])]) {
1314 case '\0':
1315 case '/':
1316 case '.':
1317 {
1318 char BIGSTK tempo[HTS_URLMAXSIZE * 2];
1319
1320 tempo[0] = '\0';
1321 strncatbuff(tempo, afs->save, (int) (a - afs->save) + strlen(hts_tbdev[i]));
1322 strcatbuff(tempo, "_");
1323 strcatbuff(tempo, a + strlen(hts_tbdev[i]));
1324 strcpybuff(afs->save, tempo);
1325 }
1326 break;
1327 }
1328 a += strlen(hts_tbdev[i]);
1329 }
1330 i++;
1331 }
1332 }
1333
1334 /* Strip ending . or ' ' forbidden on windoz */
1335 cleanEndingSpaceOrDot(afs->save);
1336
1337 #endif
1338
1339 // conversion 8-3 .. y compris pour les répertoires
1340 if (opt->savename_83) {
1341 char BIGSTK n83[HTS_URLMAXSIZE * 2];
1342
1343 long_to_83(opt->savename_83, n83, afs->save);
1344 strcpybuff(afs->save, n83);
1345 }
1346 // enforce stricter ISO9660 compliance (bug reported by Steffo Carlsson)
1347 // Level 1 File names are restricted to 8 characters with a 3 character extension,
1348 // upper case letters, numbers and underscore; maximum depth of directories is 8.
1349 // This will be our "DOS mode"
1350 // L2: 31 characters
1351 // A-Z,0-9,_
1352 if (opt->savename_83 > 0) {
1353 char *a, *last;
1354
1355 for(last = afs->save + strlen(afs->save) - 1;
1356 last != afs->save && *last != '/' && *last != '\\' && *last != '.'; last--) ;
1357 if (*last != '.') {
1358 last = NULL;
1359 }
1360 for(a = afs->save; *a != '\0'; a++) {
1361 if (*a >= 'a' && *a <= 'z') {
1362 *a -= 'a' - 'A';
1363 } else if (*a == '.') {
1364 if (a != last) {
1365 *a = '_';
1366 }
1367 } else
1368 if (!
1369 ((*a >= 'A' && *a <= 'Z') || (*a >= '0' && *a <= '9') || *a == '_'
1370 || *a == '/' || *a == '\\')) {
1371 *a = '_';
1372 }
1373 }
1374 }
1375
1376 /* ensure that there is no ../ (potential vulnerability) */
1377 fil_simplifie(afs->save);
1378
1379 /* convert name to UTF-8 ? Note: already done while parsing. */
1380 //if (charset != NULL && charset[0] != '\0') {
1381 // char *const s = hts_convertStringToUTF8(save, (int) strlen(save), charset);
1382
1383 // if (s != NULL) {
1384 // hts_log_print(opt, LOG_DEBUG,
1385 // "engine: save-name: charset conversion from '%s' to '%s' using charset '%s'",
1386 // save, s, charset);
1387 // strcpybuff(save, s);
1388 // free(s);
1389 // }
1390 //}
1391
1392 /* callback */
1393 RUN_CALLBACK5(opt, savename, adr_complete, fil_complete, referer_adr,
1394 referer_fil, afs->save);
1395
1396 hts_log_print(opt, LOG_DEBUG, "engine: save-name: local name: %s%s -> %s",
1397 adr, fil, afs->save);
1398
1399 /* Ensure that the MANDATORY "temporary" extension is set */
1400 if (ext_chg_delayed) {
1401 char *ptr;
1402 char *lastDot = NULL;
1403
1404 for(ptr = afs->save; *ptr != 0; ptr++) {
1405 if (*ptr == '.') {
1406 lastDot = ptr;
1407 } else if (*ptr == '/' || *ptr == '\\') {
1408 lastDot = NULL;
1409 }
1410 }
1411 if (lastDot == NULL) {
1412 strcatbuff(afs->save, "." DELAYED_EXT);
1413 } else if (!IS_DELAYED_EXT(afs->save)) {
1414 strcatbuff(lastDot, "." DELAYED_EXT);
1415 }
1416 }
1417 // enforce 260-character path limit before inserting destination path
1418 // note: 12 characters at least for WIN32, and 12 for ".99.delayed"
1419 // (MSDN) "When using an API to create a directory, the specified path
1420 // cannot be so long that you cannot append an 8.3 file name
1421 // (that is, the directory name cannot exceed MAX_PATH minus 12)."
1422 #define HTS_MAX_PATH_LEN ( 260 - 12 - 12 )
1423 #define MIN_LAST_SEG_RESERVE 12
1424 #define MAX_LAST_SEG_RESERVE 24
1425 #define MAX_SEG_LEN 48
1426 if (hts_stringLengthUTF8(afs->save) +
1427 hts_stringLengthUTF8(StringBuff(opt->path_html_utf8)) >=
1428 HTS_MAX_PATH_LEN) {
1429 // convert to Unicode (much simpler)
1430 size_t wsaveLen;
1431 hts_UCS4 *const wsave = hts_convertUTF8StringToUCS4(afs->save, strlen(afs->save), &wsaveLen);
1432 if (wsave != NULL) {
1433 const size_t parentLen =
1434 hts_stringLengthUTF8(StringBuff(opt->path_html_utf8));
1435 // parent path length is not insane (otherwise, ignore and pick 200 as
1436 // suffix length)
1437 const size_t maxLen =
1438 parentLen <
1439 HTS_MAX_PATH_LEN - HTS_MAX_PATH_LEN / 4
1440 ? HTS_MAX_PATH_LEN - parentLen : HTS_MAX_PATH_LEN;
1441 size_t i, j, lastSeg, lastSegSize, dirSize;
1442 char *saveFinal;
1443
1444 // pick up last segment
1445 for(i = 0, lastSeg = 0; wsave[i] != '\0'; i++) {
1446 if (wsave[i] == '/') {
1447 lastSeg = i + 1;
1448 }
1449 }
1450 lastSegSize = wsaveLen - lastSeg;
1451 if (lastSegSize > MAX_LAST_SEG_RESERVE) {
1452 lastSegSize = MAX_LAST_SEG_RESERVE;
1453 }
1454 else if (lastSegSize < MIN_LAST_SEG_RESERVE) {
1455 lastSegSize = MIN_LAST_SEG_RESERVE;
1456 }
1457
1458 // add as much pathes as we can.
1459 // note: i is in bytes, iUtf in characters
1460 for(i = 0, j = 0, dirSize = 0
1461 ; i + 1 < lastSeg && j + lastSegSize < maxLen; i++) {
1462 // reset segment counting
1463 if (wsave[i] == '/') {
1464 dirSize = 0;
1465 }
1466
1467 // copy if not too long
1468 if (dirSize < MAX_SEG_LEN) {
1469 wsave[j++] = wsave[i];
1470 dirSize++;
1471 }
1472 }
1473
1474 // last segment
1475 wsave[j++] = '/';
1476 #define MAX_UTF8_SEQ_CHARS 4
1477 for(i = lastSeg; wsave[i] != '\0' && j < maxLen; i++) {
1478 wsave[j++] = wsave[i];
1479 }
1480 // terminating \0
1481 wsave[j++] = '\0';
1482
1483 // copy final name and cleanup
1484 saveFinal = hts_convertUCS4StringToUTF8(wsave, j);
1485 if (saveFinal != NULL) {
1486 strcpybuff(afs->save, saveFinal);
1487 free(saveFinal);
1488 } else {
1489 hts_log_print(opt, LOG_ERROR, "Could not revert to UTF-8: %s%s",
1490 adr_complete, fil_complete);
1491 }
1492 free(wsave);
1493
1494 // log in debug
1495 hts_log_print(opt, LOG_DEBUG, "Too long filename shortened: %s%s => %s",
1496 adr_complete, fil_complete, afs->save);
1497 } else {
1498 hts_log_print(opt, LOG_ERROR, "Could not read UTF-8: %s", afs->save);
1499 }
1500
1501 // Re-check again ending space or dot after cut (see bug #5)
1502 cleanEndingSpaceOrDot(afs->save);
1503 }
1504 #undef MAX_UTF8_SEQ_CHARS
1505 #undef MIN_LAST_SEG_RESERVE
1506 #undef HTS_MAX_PATH_LEN
1507
1508 // chemin primaire éventuel A METTRE AVANT
1509 if (strnotempty(StringBuff(opt->path_html_utf8))) {
1510 char BIGSTK tempo[HTS_URLMAXSIZE * 2];
1511
1512 strcpybuff(tempo, StringBuff(opt->path_html_utf8));
1513 strcatbuff(tempo, afs->save);
1514 strcpybuff(afs->save, tempo);
1515 }
1516 // vérifier que le nom n'est pas déja pris...
1517 if (opt->liens != NULL) {
1518 int nom_ok;
1519
1520 do {
1521 int i;
1522
1523 //
1524 nom_ok = 1; // à priori bon
1525 // on part de la fin pour optimiser, plus les opti de taille pour
1526 // aller encore plus vite..
1527 #if DEBUG_SAVENAME
1528 printf("\nStart search\n");
1529 #endif
1530
1531 i = hash_read(hash, afs->save, NULL, HASH_STRUCT_FILENAME); // lecture type 0 (sav)
1532 if (i >= 0) {
1533 int sameAdr = (strfield2(heap(i)->adr, normadr) != 0);
1534 int sameFil;
1535
1536 // NO - URL hack is only for stripping // and www.
1537 //if (opt->urlhack != 0)
1538 // sameFil = ( strfield2(heap(i)->fil, normfil) != 0);
1539 //else
1540 sameFil = (strcmp(heap(i)->fil, normfil) == 0);
1541 if (sameAdr && sameFil) { // ok c'est le même lien, adresse déja définie
1542 /* Take the existing name not to screw up with cAsE sEnSiTiViTy of Linux/Unix */
1543 if (strcmp(heap(i)->sav, afs->save) != 0) {
1544 strcpybuff(afs->save, heap(i)->sav);
1545 }
1546 i = 0;
1547 #if DEBUG_SAVENAME
1548 printf("\nOK ALREADY DEFINED\n", 13, i);
1549 #endif
1550 } else { // utilisé par un AUTRE, changer de nom
1551 char BIGSTK tempo[HTS_URLMAXSIZE * 2];
1552 char *a = afs->save + strlen(afs->save) - 1;
1553 char *b;
1554 int n = 2;
1555 char collisionSeparator = ((opt->savename_83 != 2) ? '-' : '_');
1556
1557 tempo[0] = '\0';
1558
1559 #if DEBUG_SAVENAME
1560 printf("\nWRONG CASE UNMATCH : \n%s\n%s, REDEFINE\n", heap(i)->fil,
1561 fil_complete);
1562 #endif
1563 nom_ok = 0;
1564 i = 0;
1565
1566 while((a > afs->save) && (*a != '.') && (*a != '\\') && (*a != '/'))
1567 a--;
1568 if (*a == '.')
1569 strncatbuff(tempo, afs->save, a - afs->save);
1570 else
1571 strcatbuff(tempo, afs->save);
1572
1573 // tester la présence d'un -xx (ex: index-2.html -> index-3.html)
1574 b = tempo + strlen(tempo) - 1;
1575 while(isdigit((unsigned char) *b))
1576 b--;
1577 if (*b == collisionSeparator) {
1578 sscanf(b + 1, "%d", &n);
1579 *b = '\0'; // couper
1580 n++; // plus un
1581 }
1582 // en plus il faut gérer le 8-3 .. pas facile le client
1583 if (opt->savename_83) {
1584 int max;
1585 char *a = tempo + strlen(tempo) - 1;
1586
1587 while((a > tempo) && (*a != '/'))
1588 a--;
1589 if (*a == '/')
1590 a++;
1591 max = max_char - 1 - nombre_digit(n);
1592 if ((int) strlen(a) > max)
1593 *(a + max) = '\0'; // couper sinon il n'y aura pas la place!
1594 }
1595 // ajouter -xx (ex: index.html -> index-2.html)
1596 sprintf(tempo + strlen(tempo), "%c%d", collisionSeparator, n);
1597
1598 // ajouter extension
1599 if (*a == '.')
1600 strcatbuff(tempo, a);
1601
1602 strcpybuff(afs->save, tempo);
1603
1604 //printf("switched: %s\n",save);
1605
1606 } // if
1607 }
1608 #if DEBUG_SAVENAME
1609 printf("\nEnd search, %s\n", fil_complete);
1610 #endif
1611 } while(!nom_ok);
1612
1613 }
1614 //printf("'%s' %s %s\n",save,adr,fil);
1615
1616 return 0;
1617 }
1618
/* nom avec md5 utilisé partout */
standard_name(char * b,const char * dot_pos,const char * nom_pos,const char * fil,int short_ver)1620 void standard_name(char *b, const char *dot_pos, const char *nom_pos, const char *fil,
1621 int short_ver) {
1622 char md5[32 + 2];
1623
1624 b[0] = '\0';
1625 /* Nom */
1626 if (dot_pos) {
1627 if (!short_ver) // Noms longs
1628 strncatbuff(b, nom_pos, (dot_pos - nom_pos));
1629 else
1630 strncatbuff(b, nom_pos, min(dot_pos - nom_pos, 8));
1631 } else {
1632 if (!short_ver) // Noms longs
1633 strcatbuff(b, nom_pos);
1634 else
1635 strncatbuff(b, nom_pos, 8);
1636 }
1637 /* MD5 - 16 bits */
1638 strncatbuff(b, url_md5(md5, fil), 4);
1639 /* Ext */
1640 if (dot_pos) {
1641 strcatbuff(b, ".");
1642 if (!short_ver) // Noms longs
1643 strcatbuff(b, dot_pos + 1);
1644 else
1645 strncatbuff(b, dot_pos + 1, 3);
1646 }
1647 // Allow extensionless
1648 #ifdef DO_NOT_ALLOW_EXTENSIONLESS
1649 else {
1650 if (!short_ver) // Noms longs
1651 strcatbuff(b, DEFAULT_EXT);
1652 else
1653 strcatbuff(b, DEFAULT_EXT_SHORT);
1654 }
1655 #endif
1656 }
1657
1658 /* Petit md5 */
url_md5(char * digest,const char * fil)1659 char *url_md5(char *digest, const char *fil) {
1660 char *a;
1661
1662 digest[0] = '\0';
1663 a = strchr(fil, '?');
1664 if (a) {
1665 if (strlen(a)) {
1666 char BIGSTK buff[HTS_URLMAXSIZE * 2];
1667
1668 a++;
1669 digest[0] = buff[0] = '\0';
1670 strcatbuff(buff, a); /* query string MD5 */
1671 domd5mem(buff, strlen(buff), digest, 1);
1672 }
1673 }
1674 return digest;
1675 }
1676
1677 // interne à url_savename: ajoute une chaîne à une autre avec \ -> /
/*
 * Append "s" to the NUL-terminated string "d", turning every backslash
 * into a forward slash while copying.
 *
 *   d  destination string; must have room for strlen(d) + strlen(s) + 1
 *      bytes, since no size is passed and none is checked here.
 *   s  NUL-terminated string to append.
 *
 * The write position is a pointer rather than an "int" index, so the length
 * returned by strlen() is never narrowed to a signed int.
 */
void url_savename_addstr(char *d, const char *s) {
  char *out = d + strlen(d);

  for (; *s != '\0'; s++) {
    *out++ = (*s == '\\') ? '/' : *s;
  }
  *out = '\0';
}
1690
1691 /* "filename" should be at least 64 bytes. */
url_savename_refname(const char * adr,const char * fil,char * filename)1692 void url_savename_refname(const char *adr, const char *fil, char *filename) {
1693 unsigned char bindigest[16];
1694 struct MD5Context ctx;
1695
1696 MD5Init(&ctx, 0);
1697 MD5Update(&ctx, (const unsigned char *) adr, (int) strlen(adr));
1698 MD5Update(&ctx, (const unsigned char *) ",", 1);
1699 MD5Update(&ctx, (const unsigned char *) fil, (int) strlen(fil));
1700 MD5Final(bindigest, &ctx);
1701 sprintf(filename,
1702 CACHE_REFNAME "/" "%02x%02x%02x%02x%02x%02x%02x%02x"
1703 "%02x%02x%02x%02x%02x%02x%02x%02x" ".ref", bindigest[0], bindigest[1],
1704 bindigest[2], bindigest[3], bindigest[4], bindigest[5], bindigest[6],
1705 bindigest[7], bindigest[8], bindigest[9], bindigest[10],
1706 bindigest[11], bindigest[12], bindigest[13], bindigest[14],
1707 bindigest[15]);
1708 }
1709
1710 /* note: return a local filename */
/*
 * Return the reference-file name of the link (adr, fil), prefixed with the
 * log path.
 *
 * The relative name is produced by url_savename_refname() and joined to
 * StringBuff(opt->path_log) by fconcat(), using the buffer obtained from
 * OPT_GET_BUFF(opt).
 *
 * NOTE(review): the returned pointer presumably refers to that opt-provided
 * buffer, so it should not be freed by the caller and may be overwritten by
 * later calls - confirm against fconcat() and OPT_GET_BUFF.
 */
char *url_savename_refname_fullpath(httrackp * opt, const char *adr,
                                    const char *fil) {
  /* url_savename_refname() asks for at least 64 bytes */
  char ref_name[64];

  url_savename_refname(adr, fil, ref_name);
  return fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
                 StringBuff(opt->path_log), ref_name);
}
1719
1720 /* remove refname if any */
/*
 * Delete the reference file of the link (adr, fil), if any.
 * The file name is obtained from url_savename_refname_fullpath().
 */
void url_savename_refname_remove(httrackp * opt, const char *adr,
                                 const char *fil) {
  char *const ref_path = url_savename_refname_fullpath(opt, adr, fil);

  /* The result of UNLINK is deliberately ignored */
  (void) UNLINK(ref_path);
}
1727