Write a Python program that prompts the user for a web address (the full URL of an HTML page) and then uses BeautifulSoup and urllib to read its data.
You can use any HTML page you'd like:
The program should output the following:
Remember:
GIVEN:
Write a Python program that prompts the user for a web address (the full URL of an HTML page) and then uses BeautifulSoup and urllib to read its data.
PROGRAM:
from collections import Counter
from string import punctuation
from urllib.error import URLError
from urllib.request import urlopen  # Python 3 home of urlopen

from bs4 import BeautifulSoup  # header files (third-party: beautifulsoup4)
# Heading tag names as BeautifulSoup reports them (always lower case).
HEADING_TAGS = tuple(f"h{level}" for level in range(1, 7))


def count_words(text: str) -> Counter:
    """Return a frequency table of the words in *text*.

    Words are whitespace-separated tokens, lower-cased, with trailing
    punctuation removed. Tokens that consist only of punctuation are
    dropped so they do not inflate the total.
    """
    words = (token.rstrip(punctuation).lower() for token in text.split())
    return Counter(word for word in words if word)


def meta_content(soup: "BeautifulSoup", key: str, value: str):
    """Return the ``content`` of the first ``<meta key="value">`` tag, or None.

    The filter is passed through ``attrs`` because an attribute called
    ``name`` would otherwise collide with ``find``'s own first parameter.
    """
    tag = soup.find("meta", attrs={key: value})
    return None if tag is None else tag.get("content")


def page_report(soup: "BeautifulSoup") -> list:
    """Build the report lines for a parsed page.

    The report contains, in order: total word count, one count per heading
    level (h1-h6), paragraph count, image count, count of ``<a>`` tags that
    carry an ``href``, the page title and the page description.
    """
    # A space separator keeps words in adjacent elements from being glued
    # together; counting the page text once avoids double-counting <p> text
    # nested inside <div> elements.
    word_total = sum(count_words(soup.get_text(separator=" ")).values())

    lines = [f"Total words: {word_total}"]
    lines.extend(
        f"{name} tags: {len(soup.find_all(name))}" for name in HEADING_TAGS
    )
    lines.append(f"Paragraphs: {len(soup.find_all('p'))}")
    lines.append(f"Images: {len(soup.find_all('img'))}")
    lines.append(f"Links: {len(soup.find_all('a', href=True))}")

    # Prefer Open Graph metadata, then fall back to the plain HTML tags.
    title = meta_content(soup, "property", "og:title")
    if not title and soup.title is not None:
        title = soup.title.get_text(strip=True)
    description = (
        meta_content(soup, "property", "og:description")
        or meta_content(soup, "name", "description")
    )
    lines.append(f"Title: {title or 'not found'}")
    lines.append(f"Description: {description or 'not found'}")
    return lines


def main() -> None:
    """Prompt for a URL, download the page with urllib and print the report.

    Exits with an error message if the URL is malformed or unreachable.
    """
    url = input("Enter the url: ")
    try:
        with urlopen(url) as response:
            html = response.read()
    except (URLError, ValueError) as err:
        # ValueError covers input that is not a URL at all (e.g. no scheme).
        raise SystemExit(f"Could not read {url!r}: {err}")

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, "html.parser")
    for line in page_report(soup):
        print(line)


if __name__ == "__main__":
    main()
Get Answers For Free
Most questions are answered within 1 hour.