Skip to content

Commit

Permalink
Merge pull request #76 from mediacloud/update-requirements
Browse files (browse the repository at this point in the history)
Update requirements
  • Loading branch information
rahulbot authored Jan 22, 2024
2 parents e5aa45b + 05eb848 commit e15eb6a
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 4 deletions.
8 changes: 7 additions & 1 deletion mcmetadata/test/test_extract.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import unittest
import datetime as dt
import time
import pytest

import mcmetadata
from .. import extract
Expand All @@ -11,6 +12,11 @@

class TestExtract(unittest.TestCase):

@pytest.fixture(autouse=True)
def slow_down_tests(self):
yield
time.sleep(0.5)

def setUp(self) -> None:
webpages.DEFAULT_TIMEOUT_SECS = 30 # try to avoid timeout errors

Expand Down Expand Up @@ -107,7 +113,7 @@ def test_other_metadata(self):
assert results['text_extraction_method'] == content.METHOD_TRAFILATURA
assert results['other']['raw_title'] == "India's 75th Year Of Freedom: Why Was August 15 Chosen As Independence Day?"
assert results['other']['raw_publish_date'] == dt.datetime(2022, 8, 14, 0, 0)
assert results['other']['top_image_url'] == "https://im.indiatimes.in/content/2022/Aug/flag_62f496a5df908.jpg"
assert results['other']['top_image_url'].startswith("https://im.indiatimes.in/content/2022/Aug/flag_62f496a5df908.jpg")
assert len(results['other']['authors']) == 1

def test_whitespace_removal(self):
Expand Down
7 changes: 7 additions & 0 deletions mcmetadata/test/test_webpages.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
import unittest
import requests
import pytest
import time

from .. import webpages


class TestFetch(unittest.TestCase):

@pytest.fixture(autouse=True)
def slow_down_tests(self):
yield
time.sleep(0.5)

def test_regular_fetch(self):
url = "https://web.archive.org/web/https://bostonglobe.com"
html, response = webpages.fetch(url)
Expand Down
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@

REQUIRED_PACKAGES = [
# for date guessing
"htmldate==1.6.*", "dateparser==1.2.*",
"htmldate==1.7.*", "dateparser==1.2.*",
# for domain name and URL extraction
"tldextract==5.1.*",
"url-normalize==1.4.*",
"furl==2.1.*",
# for language detection
"py3langid==0.2.*",
# various content extractors we try to use
"newspaper3k==0.2.*", "goose3==3.1.*", "BeautifulSoup4>=4.11,<4.13", "readability-lxml==0.8.*", "trafilatura>=1.4,<1.7",
"boilerpy3==1.0.*",
"newspaper3k==0.2.*", "goose3==3.1.*", "BeautifulSoup4>=4.11,<4.13", "readability-lxml==0.8.*",
"trafilatura>=1.4,<1.7", "boilerpy3==1.0.*",
# support
"requests", # leave un-versioned so dependencies can sort out which version is best
"faust-cchardet==2.1.*", # BeautifulSoup4 speedup
Expand Down

0 comments on commit e15eb6a

Please sign in to comment.