Merge kanmaytacker's fork into master

cardoso-neto · May 7, 2019 · dbb55e4 · dbb55e4
2 parents 544d34b + 051928b
commit dbb55e4
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 0 deletions.
diff --git a/http_parser/master_parser.py b/http_parser/master_parser.py
@@ -7,6 +7,17 @@
 class MasterParser:
 
     @staticmethod
+    '''Wrapper method that calls ResponseParser to parse and retrieve the headers from the response object returned by urlopen for the URL, then calls PageParser for the actual parsing. As of now, the project can only decode UTF-8 content. The results are then stored as JSON with the attributes url, status, headers and tags.
+
+:param url: The URL of the webpage to be parsed to JSON
+:type url: string
+
+:param output_dir: Root directory where the JSON is to be stored
+:type output_dir: string
+
+:param output_file: The name the JSON file is to be given
+:type output_file: string
+'''
     def parse(url, output_dir, output_file):
         print('Crawling ' + url)
         resp = urlopen(Request(url, headers={'User-Agent': 'Mozilla/5.0'}))

diff --git a/http_parser/page_parser.py b/http_parser/page_parser.py
@@ -8,7 +8,14 @@ def __init__(self, html_string):
         self.soup = BeautifulSoup(html_string, 'html5lib')
         self.html = self.soup.find('html')
         self.all_tags = self.parse()
+'''
+This function returns a dictionary with the response headers of the page along with other meta-information. It is useful because urlopen's .info() returns this information as a mimetools.Message instance, which isn’t as easy to use as a dictionary. Each item from an .info() call is split and added to a dictionary as a key-value pair.
 
+:param response: The response object from a urlopen call to the URL of the webpage
+:type response: HTTPResponse object
+
+:returns: a dictionary with response headers and other meta-information depending on the webpage.
+'''
     def parse(self):
         results = []
         for x, tag in enumerate(self.html.descendants):

diff --git a/http_parser/response_parser.py b/http_parser/response_parser.py
@@ -6,6 +6,14 @@ def __init__(self, response):
         self.response = response
         self.headers = self.parse()
 
+'''
+This function returns a list of dictionaries comprising all the HTML tags in the webpage. Each dictionary has the keys attributes, content and name. The attributes are those of the tag itself (such as content and name), whereas the name key listed above is the name of the tag.
+
+:param html_string: the HTML data of the requested webpage
+:type html_string: string
+
+:returns: list of dictionaries with all the tags
+'''
     def parse(self):
         results = {}
         header_info = str(self.response.info()).split('\n')