How to Create a Python Keyword Analyzer for SEO Optimization
By Ed Malaker
Keyword analysis is an essential part of driving more traffic to your website, and it's a skill every content creator and SEO expert must master. Fortunately, there are some great tools available, and you can even build your own with the Python programming language, which is exactly what we're going to do right now.
Before You Get Started
This guide assumes that you have Python installed and know how to run Python scripts. If you're new, check out our guide to getting started with Python.
We are going to explain each part of the script and put the full code at the end so that you can copy and paste it.
If you don’t already have them, you will need to install the beautifulsoup4, requests, and nltk libraries before you run the script. Use the following command at the command line:
pip install beautifulsoup4 requests nltk
Setting Up the Environment
Now we can get started on the script. The first thing we want to do is set up the environment by downloading the NLTK data that the tokenizer and stopword filter rely on:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
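Depending on your NLTK version, you may need one more download: newer releases (3.9 and later) look for the tokenizer data under the name punkt_tab, so if word_tokenize later raises a LookupError mentioning it, add this line as well:

nltk.download('punkt_tab')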
Fetching Webpage Content
Next, we'll write a function that uses the requests library to get the content from a web page. It takes a URL as input, sends an HTTP GET request to that URL, and returns the HTML content of the page:
import requests

def fetch_content(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to fetch content from {url}")
        return ""
Clean and Tokenize Text
Next, we’ll clean the text by removing any punctuation and extra whitespace, then tokenize it to allow for further analysis and processing:
import re
from nltk.tokenize import word_tokenize

def clean_and_tokenize(text):
    text = re.sub(r'\s+', ' ', text)  # Collapse extra whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text.lower())
    return tokens
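Here's what the function does to a small snippet of text; the sample sentence is just made up for illustration:

tokens = clean_and_tokenize("Hello, World! This is an SEO test.")
print(tokens)  # ['hello', 'world', 'this', 'is', 'an', 'seo', 'test']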
Remove Stopwords
In the next step, we filter out stopwords: words like “the,” “is,” and “in” that appear frequently but contribute little to the meaning of the content and can skew the keyword analysis:
from nltk.corpus import stopwords

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens
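Running the tokens from the previous example through this function strips out the filler words and leaves only the meaningful ones:

tokens = ['hello', 'world', 'this', 'is', 'an', 'seo', 'test']
print(remove_stopwords(tokens))  # ['hello', 'world', 'seo', 'test']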
Analyze Keyword Density
Now, we calculate the frequency and density of each keyword. Keyword density is the number of times a keyword appears in the text, expressed as a percentage of the total word count:
from collections import Counter

def analyze_keywords(tokens):
    counter = Counter(tokens)
    total_words = sum(counter.values())
    keyword_density = {word: (count / total_words) * 100 for word, count in counter.items()}
    return keyword_density
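For example, if the filtered tokens were ['seo', 'python', 'seo', 'keywords'], the total is four words, so 'seo' gets a density of (2 / 4) * 100 = 50%, while 'python' and 'keywords' each get 25%:

density = analyze_keywords(['seo', 'python', 'seo', 'keywords'])
print(density)  # {'seo': 50.0, 'python': 25.0, 'keywords': 25.0}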
Generate the Report
With the content analyzed, we have Python assemble a report of the results that we can print onscreen:
def generate_report(url, keyword_density):
    sorted_keywords = sorted(keyword_density.items(), key=lambda item: item[1], reverse=True)
    report = f"Keyword Density Report for {url}\n"
    report += "-" * 50 + "\n"
    for keyword, density in sorted_keywords[:10]:  # Display the top 10 keywords
        report += f"Keyword: {keyword}, Density: {density:.2f}%\n"
    return report
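With the example densities from the previous step (and https://example.com standing in for a real URL), the report comes out like this:

print(generate_report("https://example.com", density))

Keyword Density Report for https://example.com
--------------------------------------------------
Keyword: seo, Density: 50.00%
Keyword: python, Density: 25.00%
Keyword: keywords, Density: 25.00%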
Create the Main Function
With all of the separate functions in place, we can put them together in a main function that takes a URL as input and prints the report when you run the script. This step uses BeautifulSoup to strip the HTML tags out of the page, so we import it here as well:

from bs4 import BeautifulSoup

def main():
    url = input("Enter the URL of the webpage: ")
    html_content = fetch_content(url)
    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text()
        tokens = clean_and_tokenize(text)
        filtered_tokens = remove_stopwords(tokens)
        keyword_density = analyze_keywords(filtered_tokens)
        report = generate_report(url, keyword_density)
        print(report)

if __name__ == "__main__":
    main()
Complete Code
Here is the complete code for you to copy and paste:
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk
import re

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Fetch webpage content
def fetch_content(url):
    try:
        response = requests.get(url, timeout=10)  # Time out after 10 seconds so the script doesn't hang
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
        return response.text
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except requests.exceptions.ConnectionError as conn_err:
        print(f"Connection error occurred: {conn_err}")
    except requests.exceptions.Timeout as timeout_err:
        print(f"Timeout error occurred: {timeout_err}")
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred: {req_err}")
    return ""

# Clean and tokenize text
def clean_and_tokenize(text):
    text = re.sub(r'\s+', ' ', text)  # Collapse extra whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert text to lowercase
    tokens = word_tokenize(text)
    return tokens

# Remove stopwords from tokens
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

# Analyze keyword density
def analyze_keywords(tokens):
    counter = Counter(tokens)
    total_words = sum(counter.values())
    keyword_density = {word: (count / total_words) * 100 for word, count in counter.items()}
    return keyword_density

# Generate keyword density report
def generate_report(url, keyword_density):
    sorted_keywords = sorted(keyword_density.items(), key=lambda item: item[1], reverse=True)
    report = f"Keyword Density Report for {url}\n"
    report += "-" * 50 + "\n"
    for keyword, density in sorted_keywords[:10]:  # Display the top 10 keywords
        report += f"Keyword: {keyword}, Density: {density:.2f}%\n"
    return report

# Main function
def main():
    url = input("Enter the URL of the webpage: ")
    html_content = fetch_content(url)
    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text()
        tokens = clean_and_tokenize(text)
        filtered_tokens = remove_stopwords(tokens)
        keyword_density = analyze_keywords(filtered_tokens)
        report = generate_report(url, keyword_density)
        print(report)
    input("Press Enter to exit...")

if __name__ == "__main__":
    main()
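To use the script, save it under any name you like (keyword_analyzer.py is just a suggestion) and run it from the command line, then paste in the URL you want to analyze when prompted:

python keyword_analyzer.py
Enter the URL of the webpage: https://example.com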
Follow GeekSided for more helpful Python scripts and to leave comments and questions.