[prev in list] [next in list] [prev in thread] [next in thread]
List: spamassassin-devel
Subject: Re: svn commit: r1880308 - /spamassassin/trunk/masses/hit-frequencies
From: "Kevin A. McGrail" <kmcgrail () apache ! org>
Date: 2020-07-26 14:07:44
Message-ID: CAMMMAUH7YZiLrfcmhzZ1Gz5PnGU+pQtRFJOphwJ7qoVkBfizEg () mail ! gmail ! com
[Download RAW message or body]
Nice!
On Sun, Jul 26, 2020, 01:50 <hege@apache.org> wrote:
> Author: hege
> Date: Sun Jul 26 05:50:00 2020
> New Revision: 1880308
>
> URL: http://svn.apache.org/viewvc?rev=1880308&view=rev
> Log:
> Tweaks to increase speed, cut runtime in half
>
> Modified:
> spamassassin/trunk/masses/hit-frequencies
>
> Modified: spamassassin/trunk/masses/hit-frequencies
> URL:
> http://svn.apache.org/viewvc/spamassassin/trunk/masses/hit-frequencies?rev=1880308&r1=1880307&r2=1880308&view=diff
>
> ==============================================================================
> --- spamassassin/trunk/masses/hit-frequencies (original)
> +++ spamassassin/trunk/masses/hit-frequencies Sun Jul 26 05:50:00 2020
> @@ -805,52 +805,48 @@ sub compute_overlaps_for_rule {
> my %overlaps_ham1r = ();
> my %overlaps_spam1r = ();
>
> - foreach my $r2 (keys %hmap_spam) {
> - next if $r1 eq $r2;
> -
> - # require that both rules have at least 1 hit
> - next unless ($freq_spam{$r1} && $freq_spam{$r2});
> -
> - my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1,
> - $hmap_spam{$r2}, $hmap_spam{$r1});
> -
> - if ($a1ina2 > 0)
> - {
> - $overlaps_spam1r{$r2} = $a1ina2;
> -
> - if (exists $overlaps_spam1{$a1ina2})
> - { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> - else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; }
> -
> - if (exists $overlaps_spam2{$a2ina1})
> - { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; }
> - else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; }
> + if ($freq_spam{$r1}) {
> + foreach my $r2 (keys %hmap_spam) {
> + next if $r1 eq $r2;
> +
> + my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1,
> + $hmap_spam{$r2}, $hmap_spam{$r1});
> +
> + if ($a1ina2 > 0)
> + {
> + $overlaps_spam1r{$r2} = $a1ina2;
> +
> + if (exists $overlaps_spam1{$a1ina2})
> + { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> + else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; }
> +
> + if (exists $overlaps_spam2{$a2ina1})
> + { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; }
> + else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; }
> + }
> }
> -
> }
>
> - foreach my $r2 (keys %hmap_ham) {
> - next if $r1 eq $r2;
> -
> - # require that both rules have at least 1 hit
> - next unless ($freq_ham{$r1} && $freq_ham{$r2});
> -
> - my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2,
> - $hmap_ham{$r2}, $hmap_ham{$r1});
> -
> - if ($a1ina2 > 0)
> - {
> - $overlaps_ham1r{$r2} = $a1ina2;
> -
> - if (exists $overlaps_ham1{$a1ina2})
> - { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> - else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; }
> -
> - if (exists $overlaps_ham2{$a2ina1})
> - { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; }
> - else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; }
> + if ($freq_ham{$r1}) {
> + foreach my $r2 (keys %hmap_ham) {
> + next if $r1 eq $r2;
> +
> + my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2,
> + $hmap_ham{$r2}, $hmap_ham{$r1});
> +
> + if ($a1ina2 > 0)
> + {
> + $overlaps_ham1r{$r2} = $a1ina2;
> +
> + if (exists $overlaps_ham1{$a1ina2})
> + { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
> + else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; }
> +
> + if (exists $overlaps_ham2{$a2ina1})
> + { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; }
> + else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; }
> + }
> }
> -
> }
>
> _print_overlap_ratios($r1, \%overlaps_spam1, \%overlaps_spam2, "spam",
> \%overlaps_ham1r, "ham");
> @@ -934,25 +930,23 @@ sub _prettify_overlap_rules {
> sub _hmap_to_overlap_ratio {
> my ($r1, $r2, $hmap1, $hmap2) = @_;
>
> - $hmap1 ||= '';
> - $hmap2 ||= '';
> - if ($hmap1 !~ /[^\000]/ || $hmap2 !~ /[^\000]/) {
> - # no hits on either! this would normally give a 100% hitrate match,
> - # but that's misleading -- so hide it by giving it a 0% overlap.
> - #
> - # also, ignore cases where there are no hits on *one* of the rules,
> - # while there are hits on the other -- after all, if one rule doesn't
> - # have a single hit, it cannot overlap.
> - #
> - return (0,0);
> - }
> -
> # my $i; for ($i = 0; $i < length($hmap1)*8; $i++) { print vec($hmap1,$i,1); } print "\n"; for ($i = 0; $i < length($hmap2)*8; $i++) { print vec($hmap2,$i,1); } print "\n";
>
> # count bits in each, so we can show when one is fully subsumed by another
> # with perl's support for bitstring ops, we get C speed here, nice!
> +
> + # no hits on either? this would normally give a 100% hitrate match,
> + # but that's misleading -- so hide it by giving it a 0% overlap.
> + #
> + # also, ignore cases where there are no hits on *one* of the rules,
> + # while there are hits on the other -- after all, if one rule doesn't
> + # have a single hit, it cannot overlap.
> +
> my $a1 = unpack("%32b*", $hmap1);
> + return (0,0) unless $a1;
> my $a2 = unpack("%32b*", $hmap2);
> + return (0,0) unless $a2;
> +
> my $a1_and_a2 = unpack("%32b*", ($hmap1 & $hmap2));
>
> # round rather than truncate
>
>
>
[Attachment #3 (text/html)]
<div dir="auto">Nice!</div><br><div class="gmail_quote"><div dir="ltr" \
class="gmail_attr">On Sun, Jul 26, 2020, 01:50 <<a \
href="mailto:hege@apache.org">hege@apache.org</a>> wrote:<br></div><blockquote \
class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc \
solid;padding-left:1ex">Author: hege<br>
Date: Sun Jul 26 05:50:00 2020<br>
New Revision: 1880308<br>
<br>
URL: <a href="http://svn.apache.org/viewvc?rev=1880308&view=rev" rel="noreferrer \
noreferrer" target="_blank">http://svn.apache.org/viewvc?rev=1880308&view=rev</a><br>
Log:<br>
Tweaks to increase speed, cut runtime in half<br>
<br>
Modified:<br>
spamassassin/trunk/masses/hit-frequencies<br>
<br>
Modified: spamassassin/trunk/masses/hit-frequencies<br>
URL: <a href="http://svn.apache.org/viewvc/spamassassin/trunk/masses/hit-frequencies?rev=1880308&r1=1880307&r2=1880308&view=diff" \
rel="noreferrer noreferrer" \
target="_blank">http://svn.apache.org/viewvc/spamassassin/trunk/masses/hit-frequencies?rev=1880308&r1=1880307&r2=1880308&view=diff</a><br>
==============================================================================<br>
--- spamassassin/trunk/masses/hit-frequencies (original)<br>
+++ spamassassin/trunk/masses/hit-frequencies Sun Jul 26 05:50:00 2020<br>
@@ -805,52 +805,48 @@ sub compute_overlaps_for_rule {<br>
my %overlaps_ham1r = ();<br>
my %overlaps_spam1r = ();<br>
<br>
- foreach my $r2 (keys %hmap_spam) {<br>
- next if $r1 eq $r2;<br>
-<br>
- # require that both rules have at least 1 hit<br>
- next unless ($freq_spam{$r1} && $freq_spam{$r2});<br>
-<br>
- my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1,<br>
- $hmap_spam{$r2}, \
$hmap_spam{$r1});<br>
-<br>
- if ($a1ina2 > 0)<br>
- {<br>
- $overlaps_spam1r{$r2} = $a1ina2;<br>
-<br>
- if (exists $overlaps_spam1{$a1ina2})<br>
- { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; \
}<br>
- else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; }<br>
-<br>
- if (exists $overlaps_spam2{$a2ina1})<br>
- { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; \
}<br>
- else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; }<br>
+ if ($freq_spam{$r1}) {<br>
+ foreach my $r2 (keys %hmap_spam) {<br>
+ next if $r1 eq $r2;<br>
+<br>
+ my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1,<br>
+ $hmap_spam{$r2}, \
$hmap_spam{$r1});<br> +<br>
+ if ($a1ina2 > 0)<br>
+ {<br>
+ $overlaps_spam1r{$r2} = $a1ina2;<br>
+<br>
+ if (exists $overlaps_spam1{$a1ina2})<br>
+ { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; \
}<br> + else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; }<br>
+<br>
+ if (exists $overlaps_spam2{$a2ina1})<br>
+ { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; \
}<br> + else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; }<br>
+ }<br>
}<br>
-<br>
}<br>
<br>
- foreach my $r2 (keys %hmap_ham) {<br>
- next if $r1 eq $r2;<br>
-<br>
- # require that both rules have at least 1 hit<br>
- next unless ($freq_ham{$r1} && $freq_ham{$r2});<br>
-<br>
- my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2,<br>
- $hmap_ham{$r2}, \
$hmap_ham{$r1});<br>
-<br>
- if ($a1ina2 > 0)<br>
- {<br>
- $overlaps_ham1r{$r2} = $a1ina2;<br>
-<br>
- if (exists $overlaps_ham1{$a1ina2})<br>
- { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; }<br>
- else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; }<br>
-<br>
- if (exists $overlaps_ham2{$a2ina1})<br>
- { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; }<br>
- else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; }<br>
+ if ($freq_ham{$r1}) {<br>
+ foreach my $r2 (keys %hmap_ham) {<br>
+ next if $r1 eq $r2;<br>
+<br>
+ my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2,<br>
+ $hmap_ham{$r2}, \
$hmap_ham{$r1});<br> +<br>
+ if ($a1ina2 > 0)<br>
+ {<br>
+ $overlaps_ham1r{$r2} = $a1ina2;<br>
+<br>
+ if (exists $overlaps_ham1{$a1ina2})<br>
+ { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; \
}<br> + else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; }<br>
+<br>
+ if (exists $overlaps_ham2{$a2ina1})<br>
+ { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; \
}<br> + else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; }<br>
+ }<br>
}<br>
-<br>
}<br>
<br>
_print_overlap_ratios($r1, \%overlaps_spam1, \%overlaps_spam2, "spam", \
\%overlaps_ham1r, "ham");<br> @@ -934,25 +930,23 @@ sub \
_prettify_overlap_rules {<br> sub _hmap_to_overlap_ratio {<br>
my ($r1, $r2, $hmap1, $hmap2) = @_;<br>
<br>
- $hmap1 ||= '';<br>
- $hmap2 ||= '';<br>
- if ($hmap1 !~ /[^\000]/ || $hmap2 !~ /[^\000]/) {<br>
- # no hits on either! this would normally give a 100% hitrate match,<br>
- # but that's misleading -- so hide it by giving it a 0% overlap.<br>
- #<br>
- # also, ignore cases where there are no hits on *one* of the rules,<br>
- # while there are hits on the other -- after all, if one rule doesn't<br>
- # have a single hit, it cannot overlap.<br>
- #<br>
- return (0,0);<br>
- }<br>
-<br>
# my $i; for ($i = 0; $i < length($hmap1)*8; $i++) { print vec($hmap1,$i,1); \
} print "\n"; for ($i = 0; $i < length($hmap2)*8; $i++) { print \
vec($hmap2,$i,1); } print "\n";<br> <br>
# count bits in each, so we can show when one is fully subsumed by another<br>
# with perl's support for bitstring ops, we get C speed here, nice!<br>
+<br>
+ # no hits on either? this would normally give a 100% hitrate match,<br>
+ # but that's misleading -- so hide it by giving it a 0% overlap.<br>
+ #<br>
+ # also, ignore cases where there are no hits on *one* of the rules,<br>
+ # while there are hits on the other -- after all, if one rule doesn't<br>
+ # have a single hit, it cannot overlap.<br>
+<br>
my $a1 = unpack("%32b*", $hmap1);<br>
+ return (0,0) unless $a1;<br>
my $a2 = unpack("%32b*", $hmap2);<br>
+ return (0,0) unless $a2;<br>
+<br>
my $a1_and_a2 = unpack("%32b*", ($hmap1 & $hmap2));<br>
<br>
# round rather than truncate<br>
<br>
<br>
</blockquote></div>
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic