From a5fba082ec91b76564bef9dc6705c4bdfce390f0 Mon Sep 17 00:00:00 2001
From: Jagadeesh L
Date: Thu, 18 Mar 2021 10:47:56 +0530
Subject: [PATCH] Delete nativepalnet-crawler directory

---
 .../nativeplanet-crawler-tekenizer.py         | 123 ------------------
 1 file changed, 123 deletions(-)
 delete mode 100644 nativepalnet-crawler/nativeplanet-crawler-tekenizer.py

diff --git a/nativepalnet-crawler/nativeplanet-crawler-tekenizer.py b/nativepalnet-crawler/nativeplanet-crawler-tekenizer.py
deleted file mode 100644
index 46e561f..0000000
--- a/nativepalnet-crawler/nativeplanet-crawler-tekenizer.py
+++ /dev/null
@@ -1,123 +0,0 @@
-import pandas as pd
-import numpy as np
-import re
-import tqdm
-from urllib.request import urlopen
-from bs4 import BeautifulSoup
-from selenium import webdriver
-import datetime
-import time
-import argparse
-import os
-from indicnlp import common
-from indicnlp.tokenize import sentence_tokenize
-from indicnlp import common
-import nltk
-from nltk.tokenize import sent_tokenize
-
-
-
-
-
-
-
-# input : link
-
-link='https://tamil.nativeplanet.com/haryana/attractions/#shaking-minarets'
-lan='ta' #language of the text
-
-outputfile='textnative_haryana.txt' # name of the output file
-
-
-
-
-
-
-# urls scraping
-chrome_path='/home/test/Downloads/chromedriver'
-driver=webdriver.Chrome(chrome_path)
-driver.get(link)
-#driver.maximize_window()
-
-scroll=6000
-
-time.sleep(10)
-driver.execute_script("window.scrollTo(0, 1000000);")
-
-time.sleep(60)
-
-driver.execute_script("window.scrollTo(0, 1000000);")
-
-time.sleep(5)
-
-parent=driver.find_element_by_id('destionationsList')
-
-links1=parent.find_elements_by_tag_name("a")
-href_links=[]
-for el in links1:
-    if el.get_attribute('href') == None or not re.search('^http', el.get_attribute('href')):
-        pass
-    else:
-        href_links.append(el.get_attribute("href"))
-l=list(set(href_links))
-
-#data scraping
-news_article=[]
-for lin in tqdm.tqdm(l):
-
-    try:
-        url = urlopen(lin)
-        content = url.read()
-
-        soup = BeautifulSoup(content, 'lxml')
-
-
-        table = soup.findAll('div',attrs={"class":"np-article-content"})
-
-        for x in table:
-            f=x.find_all('p')
-
-
-            for j in f:
-                text=j.text.replace('\n',' ')
-                news_article.append(text)
-    except :
-        print('not able to get the text from this url '+lin)
-
-data=list(set(news_article))
-
-
-#tokenizer
-
-if lan!='en':
-
-    sentences_one=[]
-    for sen in data:
-        indic_string=sen
-
-        sentences=sentence_tokenize.sentence_split(indic_string, lang=lan)
-
-        # print the sentences
-        for t in sentences:
-            sentences_one.append(t)
-else:
-
-    sentences_one=[]
-    for sen in data:
-        indic_string=sen
-
-        # Split the sentence, language code "hi" is passed for hingi
-        sentences=sent_tokenize(sen)
-
-        # print the sentences
-        for t in sentences:
-            sentences_one.append(t)
-sentences1=list(set(sentences_one))
-sentences2=[sen for sen in sentences1 if len(sen)>6]
-sentences3=[sen.strip(' \t*.') for sen in sentences2]
-
-
-
-with open(outputfile, 'w',encoding='utf-16') as file_handler:
-    for item in sentences3:
-        file_handler.write("{}\n".format(item))
-
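For reference, the deleted script's pipeline was: scrape article paragraphs with Selenium and BeautifulSoup, then sentence-split them with indic-nlp-library (or NLTK for English), deduplicate, drop short fragments, and trim stray punctuation. Below is a minimal sketch of that tokenization step, assuming indic-nlp-library and NLTK are installed; split_sentences and paragraphs are illustrative names, not from the original file.

from indicnlp.tokenize import sentence_tokenize  # Indic-script sentence splitter
from nltk.tokenize import sent_tokenize          # English sentence splitter
                                                 # (may need nltk.download('punkt') once)

def split_sentences(paragraphs, lan='ta'):
    # Mirrors the deleted script: indicnlp for Indic languages, NLTK for English.
    sentences = []
    for text in paragraphs:
        if lan != 'en':
            sentences.extend(sentence_tokenize.sentence_split(text, lang=lan))
        else:
            sentences.extend(sent_tokenize(text))
    unique = set(sentences)                      # drop exact duplicates
    kept = [s for s in unique if len(s) > 6]     # drop very short fragments
    return [s.strip(' \t*.') for s in kept]      # trim whitespace and bullet residue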