run.py
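"""Extract TeX comments from arXiv papers.

Given a set of arXiv ids (passed on the command line or harvested from a
text file), download each paper's e-print source tarball and its ADS
bibliographic record, pull the comments out of every .tex file in the
tarball, and dump everything to JSON.
"""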
from __future__ import print_function
import argparse
import json
import os
import re
import sys
import tarfile

import lxml.html
import requests

# A single-line TeX comment: one or more '%' signs not escaped as '\%',
# capturing the text that follows.
com_pt = re.compile(r'(?<!\\)%+(.+)')
# A \begin{comment}...\end{comment} block; re.DOTALL is required so that
# '.' also matches the newlines inside multi-line blocks.
multi_com_pt = re.compile(r'\\begin{comment}(.+?)\\end{comment}', re.DOTALL)
# A new-style arXiv identifier, e.g. 1706.03762.
arxiv_id_pt = re.compile(r'(?<!\d)(\d{4}\.\d{5})(?!\d)')

url_base = 'https://arxiv.org/e-print/'
url_bib_base = 'http://adsabs.harvard.edu/cgi-bin/bib_query?arXiv:'
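# Quick sanity check of the patterns above (illustrative snippets, not
# taken from any real paper):
#
#   >>> com_pt.findall('x = 1  % TODO: check sign')
#   [' TODO: check sign']
#   >>> arxiv_id_pt.findall('see arXiv:1706.03762 for details')
#   ['1706.03762']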
def get_all_arxiv_ids(text):
    # findall already returns the list of captured ids.
    return arxiv_id_pt.findall(text)
def download(url, dir_path='./'):
    # Derive a local file name from the last URL component.  Note that the
    # ADS bib URL ends in 'bib_query?arXiv:<id>', so its HTML response is
    # also cached under a '.tar.gz' name; callers only pass the path
    # around, so the misleading extension is harmless.
    idx = os.path.split(url)[-1]
    file_name = idx + '.tar.gz'
    file_path = os.path.join(dir_path, file_name)
    if os.path.exists(file_path):
        # Reuse a previously downloaded copy.
        return file_path
    r = requests.get(url)
    sys.stderr.write('\tdownload {}\n'.format(url))
    if r.status_code == 200:
        with open(file_path, 'wb') as f:
            f.write(r.content)
        return file_path
    return None
def read_papers(arxiv_ids, dir_path='./'):
    results = {}
    for arxiv_id in arxiv_ids:
        sys.stderr.write('[{}]\n'.format(arxiv_id))
        result = read_paper(arxiv_id, dir_path)
        if result:
            if 'title' in result:
                sys.stderr.write('\t({})\n'.format(result['title']))
                sys.stderr.write('\t {}\n'.format(' / '.join(result['authors'])))
            results[arxiv_id] = result
    return results
def read_paper(arxiv_id, dir_path='./'):
    targz_path = download(url_base + arxiv_id, dir_path)
    bib_path = download(url_bib_base + arxiv_id, dir_path)
    if not targz_path:
        # The source tarball could not be fetched; return an empty dict so
        # read_papers skips this id (and the type matches read_tex_files).
        return {}
    return read_tex_files(targz_path, bib_path)
def read_tex_files(file_path, bib_path=None):
    results = {}
    with tarfile.open(file_path, 'r') as tf:
        for ti in tf:
            # Only regular .tex files; extractfile returns None for
            # members that are not regular files.
            if ti.isfile() and ti.name.endswith('.tex'):
                with tf.extractfile(ti) as f:
                    comments = extract_comment(f)
                if comments:
                    results[ti.name] = comments
    if results and bib_path:
        with open(bib_path) as f:
            bib_data = extract_bibinfo(f)
        results['authors'] = bib_data['authors']
        results['title'] = bib_data['title']
    return results
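# Shape of the value returned by read_tex_files (illustrative):
#
#   {'main.tex': [' TODO: tighten the bound', ...],
#    'authors': ['Doe, Jane', ...],
#    'title': 'Some Paper Title'}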
def extract_bibinfo(f):
    info = {'title': '', 'authors': []}
    dom = lxml.html.fromstring(f.read())
    for c in dom.xpath('//meta'):
        name = c.attrib.get('name', '')
        if name == 'dc.title':
            info['title'] = c.attrib['content']
        elif name == 'dc.creator':
            info['authors'].append(c.attrib['content'])
    return info
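# extract_bibinfo above relies on the ADS page exposing Dublin Core
# <meta> tags, e.g. (illustrative):
#
#   <meta name="dc.title" content="Some Paper Title">
#   <meta name="dc.creator" content="Doe, Jane">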
def extract_comment(f):
    # Read the file once and reuse the text: iterating readlines() first
    # would exhaust the stream, so a later f.read() would return nothing
    # and multi-line comment blocks would be silently missed.  The
    # 'replace' error handler tolerates occasional non-UTF-8 bytes.
    text = f.read().decode('utf-8', errors='replace')
    results = []
    for line in text.splitlines():
        for comment in com_pt.findall(line):
            results.append(comment)
    for comment in multi_com_pt.findall(text):
        results.append(comment)
    return results
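# Minimal sketch of extract_comment on an in-memory file (the sample TeX
# is illustrative):
#
#   >>> import io
#   >>> extract_comment(io.BytesIO(b'a % one\n\\begin{comment}two\\end{comment}\n'))
#   [' one', 'two']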
def main():
    parser = argparse.ArgumentParser(description='Arxiv')
    parser.add_argument('--text', '-t', help='text which contains arxiv ids')
    parser.add_argument('--id', '-i', nargs='+', default=[])
    parser.add_argument('--save-dir', '-s', default='./')
    parser.add_argument('--output', '-o', default='./comments.json')
    args = parser.parse_args()
    sys.stderr.write(json.dumps(args.__dict__, indent=2) + '\n')

    ids = args.id
    if args.text:
        sys.stderr.write('load text: {}\n'.format(args.text))
        with open(args.text) as f:
            ids.extend(get_all_arxiv_ids(f.read()))
    ids = list(set(ids))

    if not os.path.isdir(args.save_dir):
        os.mkdir(args.save_dir)

    sys.stderr.write('TARGET:\n' + '\n'.join(
        '{} {}'.format(i, idx) for i, idx in enumerate(ids)) + '\n\n')
    all_results = read_papers(ids, args.save_dir)
    print(json.dumps(all_results, indent=2))
    with open(args.output, 'w') as f:
        json.dump(all_results, f, indent=2)


if __name__ == '__main__':
    main()
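# Example invocations (the ids below are illustrative):
#
#   python run.py --id 1706.03762 1810.04805 --save-dir ./cache
#   python run.py --text reading_list.txt --output comments.json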