+23Nov22=======================================================================
+1. The BadWord system has been rewritten. This change is pretty radical and
+ needs to be used with care as a word that is entered will be reduced to the
+ minimum sized string needed to match that word.
+
+ This is effective a sysop command changeable version of the file badw_regex
+ but in a much more sysop friendly form. The (un)set/badword <word>...
+ commands now update the /spider/local_data/badword.new file in real time
+ without having to mess about with editing files and running load/badword.
+ load/badword still works, but you should now never need it.
19Nov22=======================================================================
1. "Fix" Badword detection in spots and announces etc.
NOTE: setting $DXCommandmode::maxbadcount to 0 (default 3) will disable
'^ann?o?u?n?c?e?/full', 'announce full', 'announce',
'^ann?o?u?n?c?e?/sysop', 'announce sysop', 'announce',
'^ann?o?u?n?c?e?/(.*)$', 'announce $1', 'announce',
+ '^add/badwo?r?d?$', 'set/badword $1', 'set/badword',
],
'b' => [
'^b$', 'bye', 'bye',
'^cre?a?t?e?$', 'apropos create', 'apropos',
],
'd' => [
+ '^dele?t?e?/badwo?r?d?$', 'unset/badword $1', 'unset/badword',
'^dele?t?e?/fu', 'kill full', 'kill',
'^dele?t?e?$', 'kill', 'kill',
'^dir?e?c?t?o?r?y?/a\w*', 'directory all', 'directory',
This command will also stop TALK and ANNOUNCE/FULL from any user marked
as a BADSPOTTER.
-=== 6^SET/BADWORD <word>..^Stop things with this word being propagated
-=== 6^UNSET/BADWORD <word>..^Propagate things with this word again
+=== 6^SET/BADWORD <word>..^Stop things like this word being propagated
Setting a word as a 'badword' will prevent things like spots,
announces or talks with this word in the the text part from going any
further. They will not be displayed and they will not be sent onto
other nodes.
-The word must be written in full, no wild cards are allowed eg:-
+This has changed its meaning from the master release. All words entered
+are reduced to the minimum regex that will match words starting like
+this one:
+
+ set/badword annihilate
+
+will stop anything that starts with these words in the text
+like this:
+
+ annihilate annihilated
+
+but it will also stop things like this:
- set/badword annihilate annihilated annihilation
+ anihilate annni11ihhh ii lllattt eee ddd
-will stop anything with these words in the text.
+A few common 'leet' substitutions are automatically matched:
- unset/badword annihilated
+ b0ll0cks bo0lll0ccckks fr1iig
-will allow text with this word again.
+and so on
+
+=== 6^UNSET/BADWORD <word>..^Propagate things like this word again
+This is the opposite of set/badword <word>
+
+ unset/badword fred
+
+will allow text with this word again (if it has been set as a bad word.
=== 0^SET/BEEP^Add a beep to DX and other messages on your terminal
=== 0^UNSET/BEEP^Stop beeps for DX and other messages on your terminal
return (1, $self->msg('e5')) if $self->remotecmd;
# are we permitted?
return (1, $self->msg('e5')) if $self->priv < 6;
-$line = join(' ', map {s|[/-]\d+$||; $_} split(/\s+/, $line));
-return $BadWords::badword->set(8, $self->msg('e6'), $self, $line);
-
+my @words = split /\s+/, uc $line;
+my @out;
+my $count = 0;
+foreach my $w (@words) {
+ my @in;
+
+ if (@in = BadWords::check($w)) {
+ push @out, "BadWord $w already matched by '$in[0]', ignored";
+ } else {
+ @in = BadWords::add_regex($w);
+ push @out, "BadWord $w added as '$in[0]'";
+ $count++;
+ }
+}
+if ($count) {
+ BadWords::generate_regex();
+ BadWords::put();
+}
+return (1, @out);
+
return (1, $self->msg('e5')) if $self->remotecmd;
# are we permitted?
return (1, $self->msg('e5')) if $self->priv < 6;
-return $BadWords::badword->show(1, $self);
+my @out;
+my @l;
+my $count = 0;
+
+if ($line =~ /^\s*full/i) {
+ foreach my $w (BadWords::list_regex(1)) {
+ ++$count;
+ push @out, $w;
+ }
+}
+else {
+ foreach my $w (BadWords::list_regex()) {
+ ++$count;
+ if (@l >= 5) {
+ push @out, sprintf "%-12s %-12s %-12s %-12s %-12s", @l;
+ @l = ();
+ }
+ push @l, $w;
+ }
+ push @l, "" while @l < 5;
+ push @out, sprintf "%-12s %-12s %-12s %-12s %-12s", @l;
+}
+
+push @out, "$count BadWords";
+
+return (1, @out);
return (1, $self->msg('e5')) if $self->remotecmd;
# are we permitted?
return (1, $self->msg('e5')) if $self->priv < 6;
-$line = join(' ', map {s|[/-]\d+$||; $_} split(/\s+/, $line));
-return $BadWords::badword->unset(8, $self->msg('e6'), $self, $line);
+
+my @words = split /\s+/, uc $line;
+my @out;
+my $count = 0;
+foreach my $w (@words) {
+ my @in;
+
+ unless (@in = BadWords::check($w)) {
+ push @out, "BadWord $w not defined, ignored";
+ } else {
+ @in = BadWords::del_regex($w);
+ push @out, "BadWord $w removed";
+ $count++;
+ }
+}
+if ($count) {
+ BadWords::generate_regex();
+ BadWords::put();
+}
+return (1, @out);
--- /dev/null
+ACBAR
+AGBAR
+AKELA
+ALAH
+ALHA
+ANIHILATE
+ANIHILATION
+ANUS
+ARSE
+ATACK
+AVENGER
+BARENDERO
+BARSTARD
+BASTARD
+BASURA
+BINLADAN
+BINLADEN
+BLOD
+BOLCK
+BOLOCK
+BOMB
+BUGER
+BUST
+CABRON
+CHRISTIAN
+COCA
+COCK
+COJONES
+CORNUDO
+CORNUPETA
+CORUPTO
+CRAP
+CUEZETA
+CUNT
+DAMN
+DEADH
+DEATH
+DESGRACIADO
+DETH
+DICKHEAD
+DUMBAS
+DXFUN
+ENFERMITO
+ENFERMO
+ENVIDIOSO
+ESTUPIDO
+EXPLOSIVE
+FOLA
+FUCK
+FUK
+FUNKER
+HACKED
+HIJOPUTA
+HIJOS
+HITLER
+IDIOT
+IMBECIL
+JERK
+JIHAD
+JODAN
+JODE
+JODIENDO
+JOETE
+KIL
+KLOT
+LADEN
+LADIN
+LADRON
+MAFIA
+MAMON
+MARICA
+MARICONAZO
+MASMURDER
+MEGATON
+MENTAL
+MENTIROSO
+MIERDA
+MORO
+MOTHERFUCKER
+MULSIM
+MURDER
+MUSLIM
+NUKE
+OSAMA
+PEDERASTA
+PIS
+PLONKER
+POLA
+POLITIC
+POYA
+PUDENDA
+PUDENDUM
+PUSY
+PUTA
+PUTIN
+PUTO
+RADIOBASURA
+RAGHEAD
+RATA
+RELIGION
+RKOAL
+RKOKILER
+RKOPUTIN
+ROFRE
+ROPUTIN
+SHIT
+SLAG
+SLAUGHTER
+SLAVE
+SOD
+SPOTWAR
+STOPUTIN
+STOPWAR
+STUPID
+SUBNORMAL
+TERORIST
+TIT
+TONTO
+TOSER
+TOSPOT
+TRUCK
+TRUK
+TWAT
+URE
+VENGADOR
+WANK
+WASOCK
+WHORE
use DXUtil;
use DXVars;
-use DXHash;
use DXDebug;
use IO::File;
-use vars qw($badword $regexcode);
+our $regex; # the big bad regex generated from @relist
+our @relist; # the list of regexes to try, record = [canonical word, regex]
+my %in; # the collection of words we are building up and their regexes
-our $regex;
-# load the badwords file
+# load the badwords file(s)
sub load
{
- my $bwfn = localdata("badword");
- filecopy("$main::data.issue", $bwfn) unless -e $bwfn;
-
- my @out;
+ %in = ();
+ @relist = ();
+ $regex = '';
- $badword = new DXHash "badword";
+ my @inw;
+ my @out;
+ my $wasold;
- push @out, create_regex();
- return @out;
-}
-sub create_regex
-{
- $regex = localdata("badw_regex");
- filecopy("$regex.gb.issue", $regex) unless -e $regex;
+ my $newfn = localdata("badword.new");
+ filecopy("$main::data/badword.new.issue", $newfn) unless -e $newfn;
+ if (-e $newfn) {
+ # new style
+ dbg("BadWords: Found new style badword.new file");
+ my $fh = new IO::File $newfn;
+ if ($fh) {
+ while (<$fh>) {
+ chomp;
+ next if /^\s*\#/;
+ add_regex(uc $_);
+ }
+ $fh->close;
+ @relist = sort {$a->[0] cmp $b->[0]} @relist; # just in case...
+ dbg("BadWords: " . scalar @relist . " new style badwords read");
+ }
+ else {
+ my $l = "BadWords: can't open $newfn $!";
+ dbg($l);
+ push @out, $l;
+ return @out;
+ }
+ }
+ else {
+
+ # using old style files
+ my $bwfn = localdata("badword");
+ filecopy("$main::data/badword.issue", $bwfn) unless -e $bwfn;
- my @out;
- my $fh = new IO::File $regex;
+ # parse the existing static file
+ dbg("BadWords: Using old style badword file");
- if ($fh) {
- my $s = "sub { my \$str = shift; my \@out; \n";
- while (<$fh>) {
- chomp;
- next if /^\s*\#/;
- my @list = split " ";
- for (@list) {
- # create a closure for each word so that it matches stuff with spaces/punctuation
- # and repeated characters in it
- my $w = uc $_;
- my @l = split //, $w;
- my $e = join '+[\s\W]*', @l;
- $s .= qq{push \@out, \$1 if \$str =~ m|\\b($e+)|;\n};
+ my $fh = new IO::File $bwfn;
+ if ($fh) {
+ my $line = 0;
+ while (<$fh>) {
+ chomp;
+ ++$line;
+ next if /^\s*\#/;
+ unless (/\w+\s+=>\s+\d+,/) {
+ dbg("BadWords: syntax error in $bwfn:$line '$_'");
+ next;
+ }
+ my @line = split /\s+/, uc $_;
+ shift @line unless $line[0];
+ push @inw, $line[0];
}
+ $fh->close;
}
- $s .= "return \@out;\n}";
- $regexcode = eval $s;
- dbg($s) if isdbg('badword');
- if ($@) {
- @out = ($@);
- dbg($@);
+ else {
+ my $l = "BadWords: can't open $bwfn $!";
+ dbg($l);
+ push @out, $l;
return @out;
}
- $fh->close;
- } else {
- my $l = "can't open $regex $!";
- dbg($l);
- push @out, $l;
+
+ # do the same for badw_regex
+ my $regexfn = localdata("badw_regex");
+ filecopy("$main::data/badw_regex.gb.issue", $regexfn) unless -e $regexfn;
+ dbg("BadWords: Using old style badw_regex file");
+ $fh = new IO::File $regexfn;
+
+ if ($fh) {
+ while (<$fh>) {
+ chomp;
+ next if /^\s*\#/;
+ next if /^\s*$/;
+ push @inw, split /\s+/, uc $_;
+ }
+ $fh->close;
+ }
+ else {
+ my $l = "BadWords: can't open $regexfn $!";
+ dbg($l);
+ push @out, $l;
+ return @out;
+ }
+
+ ++$wasold;
}
+
+ # catch most of the potential duplicates
+ @inw = sort @inw;
+ for (@inw) {
+ add_regex($_);
+ }
+
+ # create the master regex
+ generate_regex();
+ # use new style from now on
+ put() if $wasold;
+
+
return @out;
}
+sub generate_regex
+{
+ my $res;
+ @relist = sort {$a->[0] cmp $b->[0]} @relist;
+ for (@relist) {
+ $res .= qq{(?:$_->[1]) |\n};
+ }
+ $res =~ s/\s*\|\s*$//;
+ $regex = qr/\b($res)/x;
+}
+
+
+sub _cleanword
+{
+ my $w = uc shift;
+ $w =~ tr/01/OI/; # de-leet any incoming words
+ my $last = ''; # remove duplicate letters (eg BOLLOCKS > BOLOCKS)
+ my @w;
+ for (split //, $w) {
+ next if $last eq $_;
+ $last = $_;
+ push @w, $_;
+ }
+ return @w ? join('', @w) : '';
+}
+
+sub add_regex
+{
+ my @list = split /\s+/, shift;
+ my @out;
+
+ for (@list) {
+ my $w = uc $_;
+ $w = _cleanword($w);
+
+ next unless $w && $w =~ /^\w+$/; # has to be a word
+ next if $in{$w}; # ignore any we have already dealt with
+ next if _slowcheck($w); # check whether this will already be detected
+
+ # re-leet word (in regex speak)if required
+ my @l = map { s/O/[O0]/g; s/I/[I1]/g; $_ } split //, $w;
+ my $e = join '+[\s\W]*', @l;
+ my $q = $e;
+ push @relist, [$w, $q];
+ $in{$w} = $q;
+ dbg("$w = $q") if isdbg('badword');
+ push @out, $w;
+ }
+ return @out;
+}
+
+sub del_regex
+{
+ my @list = split /\s+/, shift;
+ my @out;
+
+ for (@list) {
+ my $w = uc $_;
+ $w = _cleanword($w);
+ next unless $in{$w};
+ delete $in{$w};
+ @relist = grep {$_->[0] ne $w} @relist;
+ push @out, $w
+ }
+ return @out;
+}
+
+sub list_regex
+{
+ my $full = shift;
+ return map { $full ? "$_->[0] = $_->[1]" : $_->[0] } @relist;
+}
+
# check the text against the badwords list
sub check
{
my $s = uc shift;
my @out;
-
- push @out, &$regexcode($s) if $regexcode;
-
- return @out if @out;
- for (split(/\b/, $s)) {
- push @out, $_ if $badword->in($_);
+ if ($regex) {
+ my %uniq;
+ @out = grep {++$uniq{$_}; $uniq{$_} == 1 ? $_ : undef }($s =~ /\b($regex)/g);
+ dbg("BadWords: check '$s' = '" . join(', ', @out) . "'") if isdbg('badword');
+ return @out;
}
+ return _slowcheck($s) if @relist;
+ return;
+}
+
+sub _slowcheck
+{
+ my $w = shift;
+ my @out;
+
+ for (@relist) {
+ push @out, $w =~ /\b($_->[1])/;
+ }
return @out;
}
+# write out the new bad words list
+sub put
+{
+ my @out;
+ my $newfn = localdata("badword.new");
+ my $fh = new IO::File ">$newfn";
+ if ($fh) {
+ dbg("BadWords: put new badword.new file");
+ @relist = sort {$a->[0] cmp $b->[0]} @relist;
+ for (@relist) {
+ print $fh "$_->[0]\n";
+ }
+ $fh->close;
+ }
+ else {
+ my $l = "BadWords: can't open $newfn $!";
+ dbg($l);
+ push @out, $l;
+ return @out;
+ }
+}
1;
my $dxchan = shift;
my $line = shift;
- if (my @ans = BadWord::check($line)) {
+ if (my @ans = BadWords::check($line)) {
return ($dxchan->msg('e17', @ans));
}
push @{$self->{lines}}, $line;
my $no = shift;
my $line = shift;
- if (my @ans = BadWord::check($line)) {
+ if (my @ans = BadWords::check($line)) {
return ($dxchan->msg('e17', @ans));
}
${$self->{lines}}[$no] = $line;
UDPMsg::init(\&new_channel);
# load bad words
- dbg("load badwords: " . (BadWords::load() or "Ok"));
+ BadWords::load();
# prime some signals
unless ($DB::VERSION) {