Barsh4ec · Barsh4ec · Nov 19, 2023 · Nov 19, 2023 · Nov 19, 2023 · Nov 20, 2023
diff --git a/.idea/.gitignore b/.idea/.gitignore
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/python-technologies-overview.iml b/.idea/python-technologies-overview.iml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/README.md b/README.md
@@ -1 +1,37 @@
-# python-technologies-overview
+# 📈 Djinni Vacancies Scraper and Statistics Generator
+## 👀 Overview
+This Python application is designed to scrape job vacancies from Djinni, a popular job board for IT professionals, and generate statistics based on job positions categorized as junior, middle, senior, and an overall summary of required technologies.
+
+## 🌟 Features
+- Web Scraping: Utilizes the Scrapy framework to crawl Djinni and extract relevant job vacancy information, including job title, required technologies, and experience level (junior, middle, senior).
+
+- Data Processing: Processes the scraped data to extract required technologies and categorizes job positions into junior, middle, and senior levels.
+
+- Statistics Generation: Generates statistics for each experience level (junior, middle, senior) and an overall summary of the required technologies.
+
+- Visualization: Creates bar plots to visually represent the technology requirements for each experience level and for all vacancies combined.
+
+## 🚀 Getting Started
+Execute the following commands:
+```shell
+git clone https://github.com/Barsh4ec/python-technologies-overview.git
+cd python-technologies-overview
+python -m venv venv
+source venv/bin/activate # or venv\Scripts\activate in Windows
+pip install -r requirements.txt
+python main.py
+```
+After running **main.py**, all the vacancies will be scraped and the statistics charts will be created.
+You can find the results in the [analytics](/analytics) folder.
+
+## 💻 Example
+### Overall Statistics
+![image](/analytics/overall_vacancies.png)
+
+### Junior Statistics
+![image](/analytics/junior_vacancies.png)
+
+### Middle Statistics
+![image](/analytics/middle_vacancies.png)
+
+### Senior Statistics
+![image](/analytics/senior_vacancies.png)
diff --git a/analytics/junior_vacancies.png b/analytics/junior_vacancies.png
diff --git a/analytics/middle_vacancies.png b/analytics/middle_vacancies.png
diff --git a/analytics/overall_vacancies.png b/analytics/overall_vacancies.png
diff --git a/analytics/senior_vacancies.png b/analytics/senior_vacancies.png
diff --git a/main.py b/main.py
@@ -0,0 +1,32 @@
+import subprocess
+from datetime import datetime
+
+import pandas as pd
+import matplotlib.pyplot as plt
+
+
+# Crawl Djinni into vacancies.csv before building any chart.
+# check=True raises CalledProcessError when the crawl fails, so the charts are
+# never drawn from a stale or missing CSV.
+subprocess.run(
+    ["scrapy", "crawl", "djinni_spider", "-O", "vacancies.csv"],
+    check=True,
+)
+
+ranks = ["Junior", "Middle", "Senior", "Overall"]
+vacancies = pd.read_csv("vacancies.csv")
+
+for rank in ranks:
+    # "Overall" is a pseudo-rank meaning "every row"; the real ranks filter
+    # on the "Rank" column.
+    if rank == "Overall":
+        rank_vacancies = vacancies
+    else:
+        rank_vacancies = vacancies[vacancies["Rank"] == rank]
+
+    # "Technologies" holds a comma-separated list per vacancy: split it,
+    # flatten to one row per technology, then count. value_counts() drops
+    # NaN, so vacancies without technologies are ignored.
+    technology_counts = (
+        rank_vacancies["Technologies"].str.split(",").explode().value_counts()
+    )
+
+    plt.figure(figsize=(12, 6))
+    plt.bar(technology_counts.index, technology_counts)
+    plt.title(f"{rank} Vacancies requirements({datetime.now().date()})")
+    plt.xlabel("Technology")
+    plt.xticks(rotation=45)
+    plt.ylabel("Count")
+    plt.tight_layout()
+    plt.savefig(f"analytics/{rank.lower()}_vacancies.png", dpi=300)
+    # pyplot keeps every figure alive until it is closed; release this one
+    # before the next iteration creates another.
+    plt.close()
diff --git a/requirements.txt b/requirements.txt
diff --git a/scrape_technologies/__init__.py b/scrape_technologies/__init__.py
diff --git a/scrape_technologies/items.py b/scrape_technologies/items.py
@@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class ScrapeTechnologiesItem(scrapy.Item):
+    """Placeholder item from the Scrapy project template; it declares no fields.
+
+    Fields are added as class attributes, e.g. ``name = scrapy.Field()``.
+    """
diff --git a/scrape_technologies/middlewares.py b/scrape_technologies/middlewares.py
@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class ScrapeTechnologiesSpiderMiddleware:
+    """Pass-through spider middleware.
+
+    Every hook forwards what it receives unchanged. Not all methods need to
+    be defined: if a method is missing, Scrapy acts as if the spider
+    middleware does not modify the passed objects.
+    """
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the middleware instance and hook it to ``spider_opened``."""
+        middleware = cls()
+        crawler.signals.connect(
+            middleware.spider_opened, signal=signals.spider_opened
+        )
+        return middleware
+
+    def process_spider_input(self, response, spider):
+        """Handle a response on its way through the middleware into the spider.
+
+        Should return None or raise an exception; this implementation
+        always returns None.
+        """
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        """Yield the spider's results for ``response`` unchanged.
+
+        Must produce an iterable of Request or item objects.
+        """
+        yield from result
+
+    def process_spider_exception(self, response, exception, spider):
+        """Handle an exception raised by a spider or ``process_spider_input()``.
+
+        Should return either None or an iterable of Request or item
+        objects; this implementation always returns None.
+        """
+        return None
+
+    def process_start_requests(self, start_requests, spider):
+        """Yield the spider's start requests unchanged.
+
+        Works like ``process_spider_output()`` but without an associated
+        response. Must produce only requests (not items).
+        """
+        yield from start_requests
+
+    def spider_opened(self, spider):
+        """Log the name of the spider that was opened."""
+        # Pass the name as a logging argument so formatting happens only
+        # when the record is actually emitted.
+        spider.logger.info("Spider opened: %s", spider.name)
+
+
+class ScrapeTechnologiesDownloaderMiddleware:
+    """Pass-through downloader middleware.
+
+    Every hook forwards what it receives unchanged. Not all methods need to
+    be defined: if a method is missing, Scrapy acts as if the downloader
+    middleware does not modify the passed objects.
+    """
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the middleware instance and hook it to ``spider_opened``."""
+        middleware = cls()
+        crawler.signals.connect(
+            middleware.spider_opened, signal=signals.spider_opened
+        )
+        return middleware
+
+    def process_request(self, request, spider):
+        """Handle a request going through the downloader middleware.
+
+        Must either:
+        - return None: continue processing this request
+        - or return a Response object
+        - or return a Request object
+        - or raise IgnoreRequest: process_exception() methods of
+          installed downloader middleware will be called
+
+        This implementation always returns None.
+        """
+        return None
+
+    def process_response(self, request, response, spider):
+        """Handle the response returned from the downloader.
+
+        Must either return a Response object, return a Request object, or
+        raise IgnoreRequest. This implementation returns ``response`` as is.
+        """
+        return response
+
+    def process_exception(self, request, exception, spider):
+        """Handle an exception from a download handler or ``process_request()``.
+
+        Must either:
+        - return None: continue processing this exception
+        - return a Response object: stops process_exception() chain
+        - return a Request object: stops process_exception() chain
+
+        This implementation always returns None.
+        """
+        return None
+
+    def spider_opened(self, spider):
+        """Log the name of the spider that was opened."""
+        # Pass the name as a logging argument so formatting happens only
+        # when the record is actually emitted.
+        spider.logger.info("Spider opened: %s", spider.name)
diff --git a/scrape_technologies/pipelines.py b/scrape_technologies/pipelines.py
@@ -0,0 +1,13 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class ScrapeTechnologiesPipeline:
+    """Item pipeline that performs no processing."""
+
+    def process_item(self, item, spider):
+        """Return ``item`` exactly as received; ``spider`` is not used."""
+        return item
diff --git a/scrape_technologies/pull_technologies/__init__.py b/scrape_technologies/pull_technologies/__init__.py
diff --git a/scrape_technologies/pull_technologies/check_technologies.py b/scrape_technologies/pull_technologies/check_technologies.py
@@ -0,0 +1,46 @@
+import re
+
+# (label, pattern) pairs in reporting order. Patterns are matched
+# case-insensitively, except for groups that opt out with (?-i:...).
+_TECHNOLOGY_PATTERNS = tuple(
+    (label, re.compile(pattern, re.IGNORECASE))
+    for label, pattern in (
+        ("Python", r"python"),
+        ("Django", r"django"),
+        ("DRF", r"drf|rest framework| api "),
+        ("FastAPI", r"fastapi"),
+        ("Flask", r"flask"),
+        # Word start only, so "digital" or "legit" do not count as Git,
+        # while "GitHub"/"GitLab" still do.
+        ("Git", r"\bgit"),
+        ("Celery", r"celery"),
+        # Still matches PostgreSQL/MySQL, but not the "sql" inside "NoSQL".
+        ("SQL", r"(?<!no)sql"),
+        # \b instead of a literal leading space also catches "(ORM)" or a
+        # description that starts with the keyword.
+        ("ORM", r"\borm|object relational mapper|sqlalchemy"),
+        ("Docker", r"docker"),
+        # Word start only, so "laws"/"draws" do not count as AWS.
+        ("AWS/Azure", r"\baws|azure"),
+        ("Linux", r"linux"),
+        # Keeps "Node.js"/"ReactJS" but skips the "js" inside "JSON".
+        ("JS", r"js(?!on)|javascript|java script"),
+        ("Frontend", r"react|angular|\bvue"),
+        # "oop" must start a word ("loop" and "Hadoop" are not OOP). SOLID
+        # must be the upper-case acronym or "solid principle(s)", so the
+        # common phrase "solid understanding" is not a match.
+        ("OOP/SOLID", r"\boop|(?-i:\bSOLID\b)|\bsolid principle"),
+        ("NoSQL", r"nosql"),
+        ("Networking", r"networking|udp|tcp"),
+        ("HTML/CSS", r"html|css"),
+        ("Algorithms", r"algorithm|data structure"),
+        ("Asyncio", r"async"),
+        ("ML", r"\bml|machine learning|tensorflow|keras"),
+    )
+)
+
+
+def check_technologies(description: str) -> list:
+    """Return the technology labels mentioned in a vacancy description.
+
+    Args:
+        description: Free-text vacancy description.
+
+    Returns:
+        The labels whose keyword pattern occurs in ``description``, each at
+        most once, in the fixed order of ``_TECHNOLOGY_PATTERNS``. An empty
+        list when nothing matches.
+    """
+    return [
+        label
+        for label, pattern in _TECHNOLOGY_PATTERNS
+        if pattern.search(description)
+    ]