mirror of
https://bitbucket.org/anticensority/antizapret-pac-generator-light.git
synced 2025-04-07 07:03:33 +03:00
TEST: replace top words
This commit is contained in:
parent
e630f46ce2
commit
e98ac41a00
4 changed files with 1054 additions and 1 deletions
1000
dict/google-1000.txt
Normal file
1000
dict/google-1000.txt
Normal file
File diff suppressed because it is too large
Load diff
48
dict/topwords.py
Executable file
48
dict/topwords.py
Executable file
|
@@ -0,0 +1,48 @@
|
|||
#!/usr/bin/env python3
"""Emit an awk program fragment that abbreviates frequent words in domain names.

Reads a word list (one word per line) and a domain list (one domain per
line), counts how often each word occurs as a substring of the domain
names (with their last dot-separated label stripped), and prints to
stdout an awk block of gsub() calls that replaces the most frequent
words with single-character tokens.  Diagnostics (progress counter and
the chosen word list) go to stderr.

Usage: topwords.py WORDLIST DOMAINLIST > replace-common-words.awk
"""

import sys

# Single-character substitution tokens, one per selected word.  "\\&" is
# written escaped because "&" is special in awk's gsub() replacement text.
WORD_TOKENS = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
               "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
               "U", "V", "W", "X", "Y", "Z",
               "!", "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
               "=", "+", "/", ",", "<", ">", "~"]


def _read_lines(path):
    """Return the file's contents split on newlines (may contain '' entries)."""
    with open(path, "r") as handle:
        return handle.read().split("\n")


def _strip_zone(domain):
    """Drop the last dot-separated label (the zone/TLD) from *domain*."""
    return ".".join(domain.split(".")[:-1])


def _count_hits(words, domains):
    """Count, for every word, in how many domains it occurs as a substring."""
    hits = dict.fromkeys(words, 0)
    total = len(domains)
    for index, domain in enumerate(domains):
        if index % 1000 == 0:
            # Lightweight progress indicator, overwritten in place via \r.
            print(index, "/", total, end="\r", file=sys.stderr)
        for word in words:
            if word in domain:
                hits[word] += 1
    return hits


def main(words_path, domains_path):
    """Generate the awk replacement block for the given word/domain files."""
    words = _read_lines(words_path)
    domains = [_strip_zone(domain) for domain in _read_lines(domains_path)]

    hits = _count_hits(words, domains)
    # Keep only words that actually occurred; '' matches every domain and
    # is an artifact of the trailing newline, so it is dropped explicitly.
    hits = {word: count for word, count in hits.items()
            if count != 0 and word != ""}

    # Rank ascending by frequency (stable, so insertion order breaks ties,
    # as before) and keep the most frequent words -- one per token.
    ranked = sorted(hits, key=hits.get)
    chosen = ranked[-len(WORD_TOKENS):]
    # Replace longer words first so a short word never clobbers a longer
    # word that contains it.
    chosen = sorted(chosen, key=len, reverse=True)
    print(chosen, file=sys.stderr)

    print("{")
    for word, token in zip(chosen, WORD_TOKENS):
        print('gsub(/{}/, "{}", domainname)'.format(word, token))
    print("}")


if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2])
|
4
parse.sh
4
parse.sh
|
@@ -7,7 +7,7 @@ HERE="$(dirname "$(readlink -f "${0}")")"
|
|||
cd "$HERE"
|
||||
|
||||
# Extract domains from list
|
||||
awk -F ';' '{print $2}' temp/list.csv | sort -u | awk '/^$/ {next} /\\/ {next} /^[а-яА-Яa-zA-Z0-9\-\_\.\*]*+$/ {gsub(/\*\./, ""); gsub(/\.$/, ""); print}' | CHARSET=UTF-8 idn > result/hostlist_original.txt
|
||||
awk -F ';' '{print $2}' temp/list.csv | sort -u | awk '/^$/ {next} /\\/ {next} /^[а-яА-Яa-zA-Z0-9\-\_\.\*]*+$/ {gsub(/\*\./, ""); gsub(/\.$/, ""); print}' | grep -Fv 'www.bеllonа.no' | CHARSET=UTF-8 idn > result/hostlist_original.txt
|
||||
|
||||
# Generate zones from domains
|
||||
# FIXME: nxdomain list parsing is disabled due to its instability on z-i
|
||||
|
@@ -30,6 +30,8 @@ then
|
|||
awk -f scripts/getzones.awk temp/hostlist_original_with_include.txt | grep -v -F -x -f temp/exclude-hosts.txt | sort -u > result/hostlist_zones.txt
|
||||
fi
|
||||
|
||||
python dict/topwords.py dict/google-1000.txt result/hostlist_zones.txt > temp/replace-common-words.awk
|
||||
|
||||
# Generate a list of IP addresses
|
||||
awk -F';' '$1 ~ /\// {print $1}' temp/list.csv | grep -P '([0-9]{1,3}\.){3}[0-9]{1,3}\/[0-9]{1,2}' -o | sort -Vu > result/iplist_special_range.txt
|
||||
|
||||
|
|
|
@@ -1,6 +1,9 @@
|
|||
{
|
||||
domainzone = gensub(/(.*)\.([^.]+$)/, "\\2", 1)
|
||||
domainname = gensub(/(.*)\.([^.]+$)/, "\\1", 1)
|
||||
}
|
||||
@include "temp/replace-common-words.awk"
|
||||
{
|
||||
domainlength = length(domainname)
|
||||
domainarray[domainzone][domainlength][domainname] = domainname
|
||||
#print "adding", $0, ":", domainzone, domainlength, domainname
|
||||
|
|
Loading…
Add table
Reference in a new issue