From f89ec1df3707d9897ffc2827297e9c9e4843f016 Mon Sep 17 00:00:00 2001 From: Michael Drake Date: Mon, 26 Sep 2016 10:15:05 +0100 Subject: Utils: Remove redundant genpubsuffix.pl (using libnspsl now). --- utils/genpubsuffix.pl | 378 -------------------------------------------------- 1 file changed, 378 deletions(-) delete mode 100644 utils/genpubsuffix.pl (limited to 'utils/genpubsuffix.pl') diff --git a/utils/genpubsuffix.pl b/utils/genpubsuffix.pl deleted file mode 100644 index 37a95e96e..000000000 --- a/utils/genpubsuffix.pl +++ /dev/null @@ -1,378 +0,0 @@ -# -# Public suffix C code generator -# -# Copyright 2015 Vincent Sanders -# -# Permission to use, copy, modify, and/or distribute this software for -# any purpose with or without fee is hereby granted, provided that the -# above copyright notice and this permission notice appear in all -# copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL -# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE -# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL -# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA -# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER -# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -# PERFORMANCE OF THIS SOFTWARE. - - -# This program converts the public suffix list data [1] into a C -# program with static data representation and acessor function. -# -# The actual data list [2] should be placed in a file effective_tld_names.dat -# -# The C program is written to stdout, the typical 160K input file -# generates 500K of program and compiles down to a 100K object file -# -# There is a single exported function -# -# const char *getpublicsuffix(const char *hostname) -# -# This returns the public suffix of the passed hostname or NULL if -# there was an error processing the hostname. The returned pointer is -# within the passed hostname so if the returned pointer is the same as -# hostname the whole hostname is a public suffix otherwise the passed -# hostname has a private part. -# -# The resulting C file is mearly a conversion of the input data (the -# added c code is from this source and licenced under the same terms) -# and imposes no additional copyright above that of the source data -# file. -# -# Note: The pnode structure is built assuming there will never be more -# label nodes than can fit in an unsigned 16 bit value (65535) but as -# there are currently around 7500 nodes there is space for another -# 58,000 before this becomes an issue. -# -# [1] https://publicsuffix.org/ -# [2] https://publicsuffix.org/list/effective_tld_names.dat - - -# debian package for ordered hashes: libtie-ixhash-perl - -use strict; -use warnings; -use utf8; -use Tie::IxHash; - - -sub treesubdom -{ - my ($tldtree_ref, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref) = @_; - - my $domelem = pop @{$parts_ref}; - my $isexception = 0; - tie my %node, 'Tie::IxHash'; # this nodes hash - - # deal with explicit domain exceptions - $isexception = ($domelem =~ s/\A!//); - if ($isexception != 0) { - $node{"!"} = {}; - $$nodeidx_ref += 1; - } - - # Update string table - if (! exists $strtab_ref->{$domelem}) { - # add to string table - $strtab_ref->{$domelem} = $$stridx_ref; - { - use bytes; - # update the character count index - $$stridx_ref += length($domelem); - $$stridx_ref += 1; # terminator - } - - } - - # link new node list into tree - if (! exists $tldtree_ref->{$domelem}) { - $tldtree_ref->{$domelem} = \%node; - $$nodeidx_ref += 1; - } - - # recurse down if there are more parts to the domain - if (($isexception == 0) && (scalar(@{$parts_ref}) > 0)) { - treesubdom($tldtree_ref->{$domelem}, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref); - } -} - -sub phexstr -{ - use bytes; - - my ($str) = @_; - my $ret; - - my @bytes = unpack('C*', $str); - - $ret = $ret . sprintf("0x%02x, ", scalar(@bytes)); - - foreach (@bytes) { - $ret = $ret . sprintf("0x%02x, ", $_); - } - - return $ret; -} - -# generate all the children of a parent node and recurse into each of -# those updating optidx to point to the next free node -sub calc_pnode -{ - my ($parent_ref, $strtab_ref, $opidx_ref) = @_; - my $our_dat; - my $child_dat = ""; - my $startidx = $$opidx_ref; - my $lineidx = -1; - - # update the output index to after this node - $$opidx_ref += scalar keys %$parent_ref; - - # entry block - if ($startidx == ($$opidx_ref - 1)) { - $our_dat = "\n /* entry " . $startidx . " */\n "; - } else { - $our_dat = "\n /* entries " . $startidx . " to " . ($$opidx_ref - 1) . " */\n "; - } - - # iterate over each child element domain/ref pair - while ( my ($cdom, $cref) = each(%$parent_ref) ) { - # make array look pretty by limiting entries per line - if ($lineidx == 3) { - $our_dat .= "\n "; - $lineidx = 0; - } elsif ($lineidx == -1) { - $lineidx = 1; - } else { - $our_dat .= " "; - $lineidx += 1; - } - - $our_dat .= "{ "; - $our_dat .= $strtab_ref->{$cdom} . ", "; - my $child_count = scalar keys (%$cref); - $our_dat .= $child_count . ", "; - if ($child_count != 0) { - $our_dat .= $$opidx_ref; - $child_dat .= calc_pnode($cref, $strtab_ref, $opidx_ref); - } else { - $our_dat .= 0; - } - $our_dat .= " },"; - - } - - return $our_dat . $child_dat; -} - -# main -binmode(STDOUT, ":utf8"); - -my $filename = "effective_tld_names.dat"; - -open(my $fh, '<:encoding(UTF-8)', $filename) - or die "Could not open file '$filename' $!"; - -tie my %tldtree, 'Tie::IxHash'; # node tree -my $nodeidx = 1; # count of nodes allowing for the root node -tie my %strtab, 'Tie::IxHash'; # string table -my $stridx = 0; - -# put the wildcard match at 0 in the string table -$strtab{'*'} = $stridx; -$stridx += 2; - -# put the invert match at 2 in the string table -$strtab{'!'} = $stridx; -$stridx += 2; - -# read each line from prefix data and inject into hash tree -while (my $line = <$fh>) { - chomp $line; - - if (($line ne "") && ($line !~ /\/\/.*$/)) { - - # print "$line\n"; - my @parts=split("\\.", $line); - - # recusrsive call to build tree from root - - treesubdom(\%tldtree, \$nodeidx, \%strtab, \$stridx, \@parts); - } -} - -# C program header -print < -#include - -EOF - -# output string table -# -# array of characters each string is prefixed with its length and the -# node table below directly indexes emtries. As labels cannot be more -# than 63 characters a byte length is more than sufficient. - -print "static const char stab[" . $stridx . "] = {\n"; -while ( my ($key, $value) = each(%strtab) ) { - print " " . phexstr($key) . "/* " . $key . " " . $value . " */\n"; -} -print "};\n\n"; - -print "enum stab_entities {\n"; -print " STAB_WILDCARD = 0,\n"; -print " STAB_EXCEPTION = 2\n"; -print "};\n\n"; - - -# output static node array -# -# The constructed array of nodes has all siblings sequentialy and an -# index/count to its children. This yeilds a very compact data -# structure easily traversable. -# -# Additional flags for * (match all) and ! (exception) are omitted as -# they can be infered by having a node with a label of 0 (*) or 2 (!) -# as the string table has those values explicitly created. - -print "struct pnode {\n"; -print " uint32_t label; /* index of domain element in string table */\n"; -print " uint16_t child_count; /* number of children of this node */\n"; -print " uint16_t child_index; /* index of first child node */\n"; -print "};\n\n"; - -my $opidx = 1; # output index of node - -print "static const struct pnode pnodes[" . $nodeidx . "] = {\n"; - -# root node -print " /* root entry */\n { 0," . scalar keys(%tldtree) . ", " . $opidx . " },"; - -# all subsequent nodes -print calc_pnode(\%tldtree, \%strtab, \$opidx); -print "\n};\n\n"; - -# lookup code -print < hostname) && (*elem_start != DOMSEP)) { - elem_start--; - } - if (*elem_start == DOMSEP) { - elem_start++; - } - - lab_count++; - - /* search child nodes for label */ - treeidx = matchlabel(treeidx, elem_start, elem_end - elem_start); - if (treeidx == -1) { - break; - } - - if (elem_start == hostname) { - /* not valid */ - return NULL; - } - - elem_end = elem_start - 1; - elem_start = elem_end - 1; - } - - /* The public suffix algorithm says: "the domain must match - * the public suffix plus one additional label." This - * requires there to be at least two labels so we need to - * check - */ - if (lab_count == 1) { - if (elem_start == hostname) { - elem_start = NULL; - } else { - /* strip the non matching part */ - elem_start -= 2; - while (elem_start > hostname && *elem_start != DOMSEP) { - elem_start--; - } - if (*elem_start == DOMSEP) - elem_start++; - } - } - - - return elem_start; -} - - -EOF -- cgit v1.2.3