Skip to content
This repository was archived by the owner on Feb 28, 2019. It is now read-only.

Commit 96ca4d1

Browse files
committed
Automatic Domains Whitelist (Experimental)
1 parent 3c33e72 commit 96ca4d1

File tree

3 files changed

+71
-9
lines changed

3 files changed

+71
-9
lines changed

EasyWebsiteMirror.py

+43-1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import zlib
1212
import gzip
1313
from time import time
14+
from fnmatch import fnmatch
1415
from html import escape as html_escape
1516
import threading
1617
from urllib.parse import urljoin, urlsplit, urlunsplit, quote_plus
@@ -44,7 +45,7 @@
4445
errprint('Can Not Create Local File Cache: ', e, ' local file cache is disabled automatically.')
4546
local_cache_enable = False
4647

47-
__VERSION__ = '0.18.6-dev'
48+
__VERSION__ = '0.19.0-dev'
4849
__author__ = 'Aploium <[email protected]>'
4950

5051
# ########## Basic Init #############
@@ -91,6 +92,9 @@
9192
if not isinstance(target_static_domains, set):
9293
target_static_domains = set()
9394

95+
if not enable_automatic_domains_whitelist:
96+
domains_whitelist_auto_add_glob_list = tuple()
97+
9498
if not enable_individual_sites_isolation:
9599
isolated_domains = set()
96100
else:
@@ -210,6 +214,40 @@
210214
#
211215

212216
# ########## Begin Utils #############
217+
@lru_cache(maxsize=8192)
218+
def is_domain_match_glob_whitelist(domain):
219+
for domain_glob in domains_whitelist_auto_add_glob_list:
220+
if fnmatch(domain, domain_glob):
221+
return True
222+
return False
223+
224+
225+
def try_match_and_add_domain_to_rewrite_white_list(domain):
226+
if domain is None or not domain:
227+
return False
228+
if domain in external_domains_set or domain == target_domain:
229+
return True
230+
if not is_domain_match_glob_whitelist(domain):
231+
return False
232+
else:
233+
infoprint('A domain:', domain, 'was added to whitelist')
234+
235+
global external_domains, external_domains_set, allowed_domains_set
236+
_buff = list(external_domains)
237+
_buff.append(domain)
238+
external_domains = tuple(_buff)
239+
external_domains_set.add(domain)
240+
allowed_domains_set.add(domain)
241+
242+
# write log
243+
try:
244+
with open('automatic_domains_whitelist.log', 'a', encoding='utf-8') as fp:
245+
fp.write(domain + '\n')
246+
except:
247+
traceback.print_exc()
248+
249+
return True
250+
213251

214252
def current_line_number():
215253
"""Returns the current line number in our program."""
@@ -580,6 +618,10 @@ def regex_url_reassemble(match_obj):
580618
# dbgprint('returned_un_touch', whole_match_string)
581619
return whole_match_string
582620

621+
# v0.19.0+ Automatic Domains Whitelist (Experimental)
622+
if enable_automatic_domains_whitelist:
623+
try_match_and_add_domain_to_rewrite_white_list(match_domain)
624+
583625
remote_domain, _is_remote_https, remote_path = extract_real_domain_from_url_may_have_extdomains()
584626
# dbgprint('remote_path:', remote_path, 'remote_domain:', remote_domain, 'match_domain', match_domain, v=5)
585627
# dbgprint(match_obj.groups(), v=5)

config.sample.py

+23-7
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@
7373
https='https://127.0.0.1:8123',
7474
)
7575

76+
# ############## Output Settings ##############
77+
# Verbose level (0~3) 0:important and error 1:info 2:warning 3:debug. Default is 3 (for first time runner)
78+
verbose_level = 3
79+
7680
# ############## Misc Settings ##############
7781
# v0.18.4+ for some modern websites (google/wiki, etc.), we can assume they will always use utf-8 encoding.
7882
# or for some old-styled sites, we could also force the program to use gbk encoding (just for example)
@@ -84,18 +88,30 @@
8488
# 设置为 None 表示关闭显式编码指定, 'utf-8' 代表utf-8
8589
force_decode_remote_using_encode = None
8690

87-
# v0.18.5+
88-
# eg: {'access-control-max-age', 'access-control-allow-origin', 'x-connection-hash'}
89-
# must be lower case
90-
custom_allowed_remote_headers = {}
91+
# v0.19.0+ Automatic Domains Whitelist (Experimental)
92+
# Given wildcard domain patterns (glob syntax, e.g. '*.example.com'), any domain we encounter that matches one of them
93+
# will be automatically added to `external_domains`.
94+
# However, before you restart your server, you should check the 'automatic_domains_whitelist.log' file,
95+
# and manually add those domains to the config; otherwise they will no longer work after you restart your server.
96+
# You CANNOT rely on the automatic whitelist alone, because the basic (but important) rewrite requires explicitly specified domains to work.
97+
# For more supported patterns, please see: https://docs.python.org/3/library/fnmatch.html#module-fnmatch
98+
# 如果给定以通配符形式的域名, 当程序遇到匹配的域名时, 将会自动加入到 `external_domains` 的列表中
99+
# 但是, 当你重启服务器程序前, 请检查程序目录下 'automatic_domains_whitelist.log' 文件,
100+
# 并将里面的域名手动添加到 `external_domains` 的列表中 (因为程序不会在运行时修改本配置文件)
101+
# 自动域名添加白名单功能并不能取代 `external_domains` 中一个个指定的域名,
102+
# 因为基础重写(很重要)不支持使用通配符(否则会带来10倍以上的性能下降).
103+
# 如果需要使用 * 以外的通配符, 请查看 https://docs.python.org/3/library/fnmatch.html#module-fnmatch 这里的说明
104+
enable_automatic_domains_whitelist = True
105+
domains_whitelist_auto_add_glob_list = ('*.google.com', '*.gstatic.com', '*.google.com.hk')
91106

92107
# #####################################################
93108
# ################# ADVANCED Settings #################
94109
# #####################################################
95110

96-
# ############## Output Settings ##############
97-
# Verbose level (0~3) 0:important and error 1:info 2:warning 3:debug. Default is 3 (for first time runner)
98-
verbose_level = 3
111+
# v0.18.5+
112+
# eg: {'access-control-max-age', 'access-control-allow-origin', 'x-connection-hash'}
113+
# must be lower case
114+
custom_allowed_remote_headers = {}
99115

100116
# ############## Cache Settings ##############
101117
# Cache remote static files to your local storage, and access them directly from local storage if necessary.

tests/regex_rewriter_test.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,11 @@
237237
new Image().src = "/url?sa=T&url=" + esc_link + "&oi=" + e(oi)+ "&ct=" + e(ct);return false;}
238238
</script></head><body><div class="_lFe"><div class="_kFe"><font style="font-size:larger"></div></div><div class="_jFe">&nb href="https://g.zju.tools:20822/extdomains/https-zh.wikipedia.org/zh-cn/%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91">https://g.zju.tools:20822/extdomains/https-zh.wikipedia.org/zh-cn/%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91</a><br>&nbsphref="#" onclick="return go_back();" onmousedown="ctu('unauthorizedredirect','originlink');><br></div></body></html> """,
239239
),
240-
240+
(
241+
r"""<a href="https://t.co/hWOMicwES0" rel="nofollow" dir="ltr" data-expanded-url="http://onforb.es/1NqvWJT" class="twitter-timeline-link" target="_blank" title="http://onforb.es/1NqvWJT"><span class="tco-ellipsis"></span><span class="invisible">http://</span><span class="js-display-url">onforb.es/1NqvWJT</span><span class="invisible"></span><span class="tco-ellipsis"><span class="invisible">&nbsp;</span></span></a>""",
242+
r"""<a href="https://t.co/hWOMicwES0" rel="nofollow" dir="ltr" data-expanded-url="http://onforb.es/1NqvWJT" class="twitter-timeline-link" target="_blank" title="http://onforb.es/1NqvWJT"><span class="tco-ellipsis"></span><span class="invisible">http://</span><span class="js-display-url">onforb.es/1NqvWJT</span><span class="invisible"></span><span class="tco-ellipsis"><span class="invisible">&nbsp;</span></span></a>""",
243+
r"""<a href="https://t.co/hWOMicwES0" rel="nofollow" dir="ltr" data-expanded-url="http://onforb.es/1NqvWJT" class="twitter-timeline-link" target="_blank" title="http://onforb.es/1NqvWJT"><span class="tco-ellipsis"></span><span class="invisible">http://</span><span class="js-display-url">onforb.es/1NqvWJT</span><span class="invisible"></span><span class="tco-ellipsis"><span class="invisible">&nbsp;</span></span></a>""",
244+
)
241245
)
242246
ColorfulPyPrint_set_verbose_level(5)
243247

0 commit comments

Comments
 (0)