-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
114 lines (88 loc) · 3.6 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from bs4 import BeautifulSoup
import requests
import json
def fetchMovieHtml(url, timeout=30):
    """Download a page and return its HTML text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled host.

    Returns:
        The response body as text, or the sentinel string ``"null"`` when the
        request fails (connection error, timeout, or HTTP error status).
    """
    try:
        req = requests.get(url, timeout=timeout)
        # An HTTP error page is not movie markup, so report it as a failure
        # instead of handing it to the parser.
        req.raise_for_status()
        return req.text
    except requests.RequestException as e:
        print("Error making requests:", str(e))
        return "null"
def _loadMovieContainer(filePath):
    """Return the stored ``{"movies": [...]}`` container, or a fresh empty one.

    A missing or blank file is treated as "no movies stored yet" so that the
    very first run can create the file.
    """
    try:
        with open(filePath, "r", encoding="utf-8") as movieJsonFileRead:
            movieData = movieJsonFileRead.read()
    except FileNotFoundError:
        return {"movies": []}
    if movieData.strip() == "":
        return {"movies": []}
    return json.loads(movieData)


def handleFile(data, filePath="movies.json"):
    """Merge scraped movies into the JSON store, skipping known movie names.

    Args:
        data: Iterable of movie dicts; each must carry a ``"movieName"`` key.
        filePath: JSON file to update. It is created if it does not exist.

    A movie is added only if no stored (or previously added) entry has the
    same ``"movieName"``; the first entry seen for a name wins. Errors while
    reading, parsing or writing are printed and the file is left untouched.
    """
    try:
        mvCont = _loadMovieContainer(filePath)
        movies = mvCont["movies"]

        # Track names in a set so each membership test is O(1) instead of
        # rescanning the whole stored list for every scraped movie.
        knownNames = {movie["movieName"] for movie in movies}
        for obj in data:
            movieName = obj["movieName"]
            if movieName in knownNames:
                continue
            knownNames.add(movieName)
            movies.append(obj)

        formatedData = json.dumps(mvCont)
        with open(filePath, "w", encoding="utf-8") as movieJsonFileWrite:
            movieJsonFileWrite.write(formatedData)
    except (OSError, ValueError, KeyError, TypeError) as e:
        print("Error:", str(e))
def scrapeMovies(movieUrl):
    """Scrape one listing page and merge its movies into the JSON store.

    Args:
        movieUrl: URL of the listing page to scrape.

    Returns:
        The parsed ``BeautifulSoup`` document for the page, or ``None`` when
        the page could not be fetched.
    """
    # fetch page html ("null" is fetchMovieHtml's failure sentinel)
    pageContent = fetchMovieHtml(movieUrl)
    if pageContent == "null":
        return None

    # handle page content using bsoup
    soup = BeautifulSoup(pageContent, "html.parser")
    movieArticles = soup.find_all("article", class_="file-one shadow")

    # walk each movie card and pull out thumbnail, title and link
    allmoviesInfo = []
    for movieArticle in movieArticles:
        imgElement = movieArticle.find("img", src=True)
        movieInfo = movieArticle.find("div", class_="info")
        vheading = movieInfo.find("h2") if movieInfo is not None else None
        vLink = vheading.find("a", href=True) if vheading is not None else None

        # A card missing any expected element is skipped so that one
        # malformed entry does not abort the whole page.
        if imgElement is None or vLink is None:
            continue

        allmoviesInfo.append(
            {
                "movie_thumbnail": imgElement["src"],
                "movieName": vLink.text,
                "movieUrl": vLink["href"],
            }
        )

    handleFile(allmoviesInfo)
    return soup
def _printStoredMovieTotal():
    """Print how many movies are currently stored in ``movies.json``."""
    try:
        with open("movies.json", "r", encoding="utf-8") as movieJsonFileRead:
            movies = json.load(movieJsonFileRead)["movies"]
    except (OSError, ValueError, KeyError, TypeError) as e:
        print("Error:", str(e))
        return
    print(f"Total movies scrapped into 'movies.json' file : {len(movies)}")


def paginatePage(maxPage, stopAtPage=3):
    """Scrape listing pages 1..maxPage, stopping early when appropriate.

    Args:
        maxPage: Highest page number that may be requested.
        stopAtPage: Page number after which scraping stops and the stored
            movie total is printed. Pass ``None`` to disable this early stop.

    Scraping also ends as soon as a page cannot be fetched or its pagination
    bar no longer offers a "next" link.
    """
    for page in range(1, maxPage + 1):
        MOVIE_URL = "https://www.thenetnaija.net/videos/movies/page/{}".format(page)
        # invoke the scrape function
        soup = scrapeMovies(MOVIE_URL)
        print(f"Scrapping {page}")

        if stopAtPage is not None and page == stopAtPage:
            _printStoredMovieTotal()
            break

        # scrapeMovies returns None when the fetch failed; without markup
        # there is no way to tell whether further pages exist.
        if soup is None:
            break

        # handle movie paginations: the last <li> holds the "next" button
        paginatedContainer = soup.find("ul", class_="pagination")
        if paginatedContainer is None:
            break
        allPaginatedList = paginatedContainer.find_all("li")
        if not allPaginatedList:
            break
        link = allPaginatedList[-1].find("a", class_="next page-numbers")
        if link is None:
            break
# Upper bound on the listing page number handed to paginatePage. The crawl
# may end sooner: paginatePage stops at page 3 unless that check is changed.
MAX_MOVIE_PAGE = 228

# Script entry: start scraping as soon as this file is executed.
paginatePage(maxPage=MAX_MOVIE_PAGE)