[prev in list] [next in list] [prev in thread] [next in thread]
List: wget
Subject: patch / feature request: option for filtering .html
From: Loki <slack () eskimo ! com>
Date: 1998-07-22 22:16:16
[Download RAW message or body]
wget downloads all the .html files and THEN removes
them if they are not in the accept list or are in the reject list. I made a small
patch (for 1.5.2) that adds an option (--filter-html, -nZ) so it doesn't
download any HTML files that don't pass the filters (except "", index.html).
Without this, wget -k -r --no-parent -A 5.10_hydrogen\*
http://www.wired.com/collections/space_exploration/5.10_hydrogen1.html
would download tons of HTML files and then delete them, when all I want are the
ones matching the accept list that are accessible from the start URL.
["wget-patch" (TEXT/PLAIN)]
diff -u --recursive wget-1.5.2/src/init.c wget-1.5.2-patched/src/init.c
--- wget-1.5.2/src/init.c Tue Jun 23 16:29:18 1998
+++ wget-1.5.2-patched/src/init.c Wed Jul 22 16:02:38 1998
@@ -103,6 +103,7 @@
{ "dotstyle", NULL, cmd_spec_dotstyle },
{ "excludedirectories", &opt.excludes, cmd_directory_vector },
{ "excludedomains", &opt.exclude_domains, cmd_vector },
+ { "filterhtml", &opt.filterhtml, cmd_boolean },
{ "followftp", &opt.follow_ftp, cmd_boolean },
{ "forcehtml", &opt.force_html, cmd_boolean },
{ "ftpproxy", &opt.ftp_proxy, cmd_string },
diff -u --recursive wget-1.5.2/src/main.c wget-1.5.2-patched/src/main.c
--- wget-1.5.2/src/main.c Sat May 9 11:05:26 1998
+++ wget-1.5.2-patched/src/main.c Wed Jul 22 16:03:26 1998
@@ -205,6 +205,7 @@
{ "debug", no_argument, NULL, 'd' },
{ "dont-remove-listing", no_argument, NULL, 21 },
{ "email-address", no_argument, NULL, 'E' }, /* undocumented (debug) */
+ { "filter-html", no_argument, NULL, 24 },
{ "follow-ftp", no_argument, NULL, 14 },
{ "force-directories", no_argument, NULL, 'x' },
{ "force-hier", no_argument, NULL, 'x' }, /* obsolete */
@@ -334,6 +335,9 @@
case 22:
setval ("simplehostcheck", "on");
break;
+ case 24:
+ setval ("filterhtml", "on");
+ break;
case 'b':
setval ("background", "on");
break;
@@ -511,6 +515,9 @@
case 'p':
setval ("noparent", "on");
break;
+ case 'Z':
+ setval ("filterhtml", "on");
+ break;
default:
printf (_("%s: illegal option -- `-n%c'\n"), exec_name, *p);
print_usage ();
diff -u --recursive wget-1.5.2/src/options.h wget-1.5.2-patched/src/options.h
--- wget-1.5.2/src/options.h Tue Apr 28 16:29:40 1998
+++ wget-1.5.2-patched/src/options.h Wed Jul 22 16:04:48 1998
@@ -36,6 +36,8 @@
int relative_only; /* Follow only relative links. */
int no_parent; /* Restrict access to the parent
directory. */
+ int filterhtml; /* Filters html with accept/reject options
+ BEFORE downloading. */
int simple_check; /* Should we use simple checking
(strcmp) or do we create a host
hash and call gethostbyname? */
diff -u --recursive wget-1.5.2/src/recur.c wget-1.5.2-patched/src/recur.c
--- wget-1.5.2/src/recur.c Fri May 1 12:32:08 1998
+++ wget-1.5.2-patched/src/recur.c Wed Jul 22 16:22:14 1998
@@ -308,9 +308,9 @@
If the file *is* supposed to be HTML, it will *not* be
subject to acc/rej rules. That's why the `!'. */
if (!
(!*u->file
|| (((suf = suffix (constr)) != NULL)
- && (!strcmp (suf, "html") || !strcmp (suf, "htm")))))
+ && (!opt.filterhtml) && (!strcmp (suf, "html") || !strcmp (suf, "htm")))))
{
if (!acceptable (u->file))
{
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic