[prev in list] [next in list] [prev in thread] [next in thread] 

List:       wget
Subject:    patch / feature request: option for filtering .html
From:       Loki <slack () eskimo ! com>
Date:       1998-07-22 22:16:16
[Download RAW message or body]

wget downloads all the .html files and THEN removes
them if they are not in the accept list or are in the reject list. I made a
small patch (for 1.5.2) that adds an option (--filter-html, -nZ) so it doesn't
download any HTML files that don't pass the filters (except "" and index.html).

Without this option, the command wget -k -r --no-parent -A 5.10_hydrogen\*
http://www.wired.com/collections/space_exploration/5.10_hydrogen1.html

would download tons of HTML files, then delete them, when all I want are the
ones matching the accept list that are accessible from the start URL.


["wget-patch" (TEXT/PLAIN)]

diff -u --recursive wget-1.5.2/src/init.c wget-1.5.2-patched/src/init.c
--- wget-1.5.2/src/init.c	Tue Jun 23 16:29:18 1998
+++ wget-1.5.2-patched/src/init.c	Wed Jul 22 16:02:38 1998
@@ -103,6 +103,7 @@
   { "dotstyle",		NULL,			cmd_spec_dotstyle },
   { "excludedirectories", &opt.excludes,	cmd_directory_vector },
   { "excludedomains",	&opt.exclude_domains,	cmd_vector },
+  { "filterhtml",	&opt.filterhtml,	cmd_boolean },
   { "followftp",	&opt.follow_ftp,	cmd_boolean },
   { "forcehtml",	&opt.force_html,	cmd_boolean },
   { "ftpproxy",		&opt.ftp_proxy,		cmd_string },
diff -u --recursive wget-1.5.2/src/main.c wget-1.5.2-patched/src/main.c
--- wget-1.5.2/src/main.c	Sat May  9 11:05:26 1998
+++ wget-1.5.2-patched/src/main.c	Wed Jul 22 16:03:26 1998
@@ -205,6 +205,7 @@
     { "debug", no_argument, NULL, 'd' },
     { "dont-remove-listing", no_argument, NULL, 21 },
     { "email-address", no_argument, NULL, 'E' }, /* undocumented (debug) */
+    { "filter-html", no_argument, NULL, 24 },
     { "follow-ftp", no_argument, NULL, 14 },
     { "force-directories", no_argument, NULL, 'x' },
     { "force-hier", no_argument, NULL, 'x' }, /* obsolete */
@@ -334,6 +335,9 @@
 	case 22:
 	  setval ("simplehostcheck", "on");
 	  break;
+	case 24:
+	  setval ("filterhtml", "on");
+	  break;
 	case 'b':
 	  setval ("background", "on");
 	  break;
@@ -511,6 +515,9 @@
 		case 'p':
 		  setval ("noparent", "on");
 		  break;
+		case 'Z':
+		setval ("filterhtml", "on");
+		break;
 		default:
 		  printf (_("%s: illegal option -- `-n%c'\n"), exec_name, *p);
 		  print_usage ();
diff -u --recursive wget-1.5.2/src/options.h wget-1.5.2-patched/src/options.h
--- wget-1.5.2/src/options.h	Tue Apr 28 16:29:40 1998
+++ wget-1.5.2-patched/src/options.h	Wed Jul 22 16:04:48 1998
@@ -36,6 +36,8 @@
   int relative_only;		/* Follow only relative links. */
   int no_parent;		/* Restrict access to the parent
 				   directory.  */
+  int filterhtml;		/* Filters html with accept/reject options
+				   BEFORE downloading. */
   int simple_check;		/* Should we use simple checking
 				   (strcmp) or do we create a host
 				   hash and call gethostbyname? */
diff -u --recursive wget-1.5.2/src/recur.c wget-1.5.2-patched/src/recur.c
--- wget-1.5.2/src/recur.c	Fri May  1 12:32:08 1998
+++ wget-1.5.2-patched/src/recur.c	Wed Jul 22 16:22:14 1998
@@ -308,9 +308,9 @@
 	     If the file *is* supposed to be HTML, it will *not* be
 	     subject to acc/rej rules.  That's why the `!'.  */
 	  if (!
 	       || (((suf = suffix (constr)) != NULL)
-		   && (!strcmp (suf, "html") || !strcmp (suf, "htm")))))
+		   && (!opt.filterhtml) && (!strcmp (suf, "html") || !strcmp (suf, "htm")))))
 	    {
 	      if (!acceptable (u->file))
 		{


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic