#!/usr/bin/env python

"""Download all CHART project wiki pages in text format to the output directory.

File attachments are not downloaded.
"""
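
# Example invocation (the script filename and output directory shown here
# are illustrative only):
#   python dump_wiki_pages.py --outdir wiki_text
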
import os
import urllib
import urlparse
from bs4 import BeautifulSoup

from chart.project import settings
from chart.common import ensure_dir_exists
from chart.common.args import ArgumentParser


def main():
    """Command line entry point."""
    parser = ArgumentParser()
    parser.add_argument('--outdir', '-o',
                        required=True,
                        help='Output directory')
    args = parser.parse_args()

    ensure_dir_exists(args.outdir)

    index_url = settings.PROJ_WIKI_URL + '/TitleIndex'
    index_parse = urlparse.urlparse(index_url)

    # parse the TitleIndex page, which links to every page in the wiki
    page = BeautifulSoup(urllib.urlopen(index_url), 'html.parser')

    for link in page.body.find_all('a'):
        # skip anchors that carry no href attribute
        href = link.get('href')
        if href is None:
            continue

        print('processing ' + href)

        # convert the root-relative link to an absolute URL and request
        # the wiki's plain-text rendering of the page
        url = index_parse.scheme + '://' + index_parse.netloc + href + '?format=txt'
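        # An equivalent sketch using urlparse.urljoin, assuming every href
        # encountered here is root-relative (shown for comparison only,
        # not used below):
        #   url = urlparse.urljoin(index_url, href) + '?format=txt'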

        # only download the page if the link points back into the wiki
        if url.startswith(settings.PROJ_WIKI_URL):
            savename = os.path.join(args.outdir, href.split('/')[-1] + '.txt')
            print('downloading ' + url + ' as ' + savename)

            with open(savename, 'w') as out:
                out.write(urllib.urlopen(url).read())
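
            # A more defensive variant would fetch before opening the output
            # file and skip pages that fail to download instead of aborting
            # the run (a sketch, assuming urllib signals network failures
            # with IOError):
            #   try:
            #       body = urllib.urlopen(url).read()
            #   except IOError as exc:
            #       print('skipping ' + url + ': ' + str(exc))
            #       continue
            #   with open(savename, 'w') as out:
            #       out.write(body)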

if __name__ == '__main__':
    main()