"""Scrape the articles listed in a CSV file and save each one as a text file.

Recovered from the patch that created ``prothomalo-crawler/scrape_links.py``.

Usage::

    python scrape_links.py <links.csv> <output_dir>

The CSV file must contain a ``Link`` column (URL of each article) and a
``Headline`` column (used to build the output file name).  For every row the
page is downloaded, the text of its ``p``, ``h1``, ``h2`` and ``figcaption``
elements is extracted, and each element is written as one line of
``<output_dir>/<row number> <first 10 chars of cleaned headline>.txt``.
"""

import os
import sys
import time

import pandas as pd

# Characters removed from a headline before it is used in a file name.
BAD_CHARS = frozenset(';:!?|/"+<>.*')

# HTML elements whose text is written to the article file.
CONTENT_TAGS = ['p', 'h1', 'h2', 'figcaption']

# Seconds to wait on a server before giving up on a link; without a timeout
# a single stalled connection would block the whole run.
REQUEST_TIMEOUT = 30

# Number of cleaned headline characters kept in the file name.
HEADLINE_CHARS = 10


def clean_filename(headline):
    """Return *headline* with every character in ``BAD_CHARS`` removed.

    A missing headline (``None`` or NaN, which is how pandas represents an
    empty CSV cell) yields an empty string; any other non-string value is
    converted with ``str`` first.
    """
    if not isinstance(headline, str):
        # ``headline != headline`` is true only for NaN.
        if headline is None or headline != headline:
            return ''
        headline = str(headline)
    return ''.join(ch for ch in headline if ch not in BAD_CHARS)


def build_filename(count, headline):
    """Return the output file name for row *count* with title *headline*."""
    return f"{count} {clean_filename(headline)[:HEADLINE_CHARS]}.txt"


def write_article(path, lines):
    """Write each string of *lines* to *path*, UTF-16 encoded, CRLF-terminated.

    ``newline=""`` disables newline translation so the explicit ``"\\r\\n"``
    is written unchanged on every platform (with translation enabled it would
    become ``"\\r\\r\\n"`` on Windows).
    """
    with open(path, mode="w", encoding="utf-16", newline="") as file_w:
        file_w.writelines(f"{line}\r\n" for line in lines)


def scrape_links(csv_path, out_dir):
    """Download every link in *csv_path* and write its text into *out_dir*.

    Links that cannot be fetched (network error, timeout, HTTP error status)
    are reported on stderr and skipped so that one bad link does not abort
    the whole run.

    Returns the number of links that were skipped.
    """
    # Imported here so the pure helpers above stay importable on machines
    # that do not have the scraping dependencies installed.
    import requests
    from bs4 import BeautifulSoup

    table = pd.read_csv(csv_path)
    os.makedirs(out_dir, exist_ok=True)

    skipped = 0
    rows = zip(table['Link'], table['Headline'])
    with requests.Session() as session:
        for count, (link, headline) in enumerate(rows):
            try:
                response = session.get(link, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as err:
                print(f"skipping {link}: {err}", file=sys.stderr)
                skipped += 1
                continue

            soup = BeautifulSoup(response.content, "html.parser")
            lines = [tag.text.strip() for tag in soup.find_all(CONTENT_TAGS)]
            write_article(
                os.path.join(out_dir, build_filename(count, headline)),
                lines,
            )
    return skipped


def main(argv=None):
    """Run the scraper; *argv* defaults to ``sys.argv``.

    argv[0] --> script name,
    argv[1] --> csv file path with links to scrape from,
    argv[2] --> directory name.

    Returns the process exit status (0 on success, 2 on a usage error).
    """
    args = sys.argv if argv is None else argv
    if len(args) < 3:
        script = args[0] if args else "scrape_links.py"
        print(f"usage: {script} <links.csv> <output_dir>", file=sys.stderr)
        return 2

    start_time = time.time()
    scrape_links(str(args[1]), str(args[2]))
    print(f" {time.time() - start_time} seconds ")
    return 0


if __name__ == "__main__":
    sys.exit(main())