-
Notifications
You must be signed in to change notification settings - Fork 1
/
configuration.py
115 lines (90 loc) · 3.88 KB
/
configuration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# -*- coding: utf-8 -*-
"""\
This is a python port of "Goose" orignialy licensed to Gravity.com
under one or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.
Python port was written by Xavier Grangier for Recrutae
Gravity.com licenses this file
to you under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import os
import tempfile
from goose.text import StopWords
from goose.parsers import Parser
from goose.parsers import ParserSoup
from goose.version import __version__
HTTP_DEFAULT_TIMEOUT = 30
AVAILABLE_PARSERS = {
'lxml': Parser,
'soup': ParserSoup,
}
class Configuration(object):
def __init__(self):
# What's the minimum bytes for an image we'd accept is,
# alot of times we want to filter out the author's little images
# in the beginning of the article
self.images_min_bytes = 4500
# set this guy to false if you don't care about getting images,
# otherwise you can either use the default
# image extractor to implement the ImageExtractor
# interface to build your own
self.enable_image_fetching = True
# set this valriable to False if you want to force
# the article language. OtherWise it will attempt to
# find meta language and use the correct stopwords dictionary
self.use_meta_language = True
# default language
# it will be use as fallback
# if use_meta_language is set to false, targetlanguage will
# be use
self.target_language = 'en'
# defautl stopwrods class
self.stopwords_class = StopWords
# path to your imagemagick convert executable,
# on the mac using mac ports this is the default listed
self.imagemagick_convert_path = "/opt/local/bin/convert"
# path to your imagemagick identify executable
self.imagemagick_identify_path = "/opt/local/bin/identify"
# used as the user agent that
# is sent with your web requests to extract an article
# self.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2)"\
# " AppleWebKit/534.52.7 (KHTML, like Gecko) "\
# "Version/5.1.2 Safari/534.52.7"
self.browser_user_agent = 'Goose/%s' % __version__
# debug mode
# enable this to have additional debugging information
# sent to stdout
self.debug = False
# TODO
self.extract_publishdate = None
# TODO
self.additional_data_extractor = None
# Parser type
self.available_parsers = AVAILABLE_PARSERS.keys()
self.parser_class = 'lxml'
# set the local storage path
# make this configurable
self.local_storage_path = os.path.join(tempfile.gettempdir(), 'goose')
# http timeout
self.http_timeout = HTTP_DEFAULT_TIMEOUT
def get_parser(self):
return AVAILABLE_PARSERS[self.parser_class]
def get_additionaldata_extractor(self):
return self.additional_data_extractor
def set_additionaldata_extractor(self, extractor):
"""\
Pass in to extract any additional data not defined within
@param extractor a concrete instance of AdditionalDataExtractor
"""
if not extractor:
raise ValueError("extractor must not be null!")
self.additional_data_extractor = extractor