#!/usr/bin/env python3

"""Download all CHART project wiki pages as plain text into an output directory.
File attachments are not downloaded.
"""

import os
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

from chart.project import settings
from chart.common import ensure_dir_exists
from chart.common.args import ArgumentParser


def main():
    """Command line entry point."""
    parser = ArgumentParser()
    parser.add_argument('--outdir', '-o',
                        required=True,
                        help='Output directory')
    args = parser.parse_args()

    ensure_dir_exists(args.outdir)

    # the TitleIndex page is assumed to list every page in the wiki
    index_url = settings.PROJ_WIKI_URL + '/TitleIndex'
    index_parse = urllib.parse.urlparse(index_url)

    page = BeautifulSoup(urllib.request.urlopen(index_url), 'html.parser')

    # examine every anchor in the index page that carries an href attribute
    for link in page.body.find_all('a', href=True):
        href = link['href']
        print('processing ' + href)

        # convert the relative link to an absolute URL; appending
        # '?format=txt' requests the page as raw wiki text
        url = index_parse.scheme + '://' + index_parse.netloc + href + '?format=txt'
        # only download links that point back into the project wiki
        if url.startswith(settings.PROJ_WIKI_URL):
            savename = os.path.join(args.outdir, href.split('/')[-1] + '.txt')
            print('downloading ' + url + ' as ' + savename)

            # urlopen() returns bytes, so write the page in binary mode
            with open(savename, 'wb') as outfile:
                outfile.write(urllib.request.urlopen(url).read())


if __name__ == '__main__':
    main()