Module fontai.io.scrappers

This module contains logic that was used to scrape three font file sources: Google Fonts, 1001freefonts.com and dafont.com. As of May 2021, at least one of those sites has changed its URL structure, so some of these classes may no longer work and some rework may be needed to scrape the files again.
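
All scrappers expose the same single-method interface: get_source_urls yields URLs as strings and leaves downloading to the caller. Below is a minimal sketch of that usage pattern; the download_all helper and the output folder name are illustrative, not part of the module.

import urllib.request
from pathlib import Path

from fontai.io.scrappers import Scrapper, GoogleFontsScrapper

def download_all(scrapper: Scrapper, out_dir: str) -> None:
  # hypothetical helper: fetch every yielded URL into out_dir
  folder = Path(out_dir)
  folder.mkdir(parents=True, exist_ok=True)
  for k, url in enumerate(scrapper.get_source_urls()):
    urllib.request.urlretrieve(url, folder / f"source-{k}")

download_all(GoogleFontsScrapper(), "raw-fonts")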

Expand source code
"""This module contains logic that was used to scrape three font file sources: google, 1001fonts.com and dafont.com. As of May 2021 at least one of those sites have changed their url sources and so, some of these classes might not work anymore, and some work might be required to scrape the files again.

"""
import time
import requests 
import re
import random
import typing as t
from abc import ABC, abstractmethod
import urllib.request
import logging
from pathlib import Path

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

__all__ = [
  "FreeFontsFileScrapper",
  "DafontsFileScrapper",
  "GoogleFontsScrapper"]
 
class Scrapper(ABC):

  """Interface implemented by web scrapper classes. Contains a single method, get_source_urls.
  """

  @abstractmethod
  def get_source_urls(self) -> t.Generator[str, None, None]:
    """Returns a generator of string objects pointing to each scrappable URL

    """
    pass

class GoogleFontsScrapper(Scrapper):

  """Retrieves the main zip file from Google fonts repository
  """
  
  def get_source_urls(self):
    yield "https://github.com/google/fonts/archive/main.zip"


class FreeFontsFileScrapper(Scrapper):

  """Retrieves font files from www.1001freefonts.com
  
  """
  
  def __init__(self):
    # download IDs are sequential integers; the upper bound is hard-coded
    self.min_id = 0
    self.max_id = 27000

  def get_source_urls(self):
    for font_id in range(self.min_id, self.max_id):
      yield f"https://www.1001freefonts.com/d/{font_id}/"


class DafontsFileScrapper(Scrapper):
  """
    Retrieves font files from www.dafont.com

  """

  def get_source_urls(self):
    my_ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36"
    for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
      # parse html of first letter page
      letter_page_url = "https://www.dafont.com/alpha.php?lettre={l}&page=1&fpp=200".format(l=letter.lower())

      raw_html = requests.get(letter_page_url, headers={"user-agent": my_ua}).text
      page_html = BeautifulSoup(raw_html, "html.parser")
      # find the pagination links for the current letter
      pagination = page_html.find(class_="noindex")
      page_refs = [a["href"] for a in pagination.find_all("a")] if pagination is not None else []
      # get the number of pages for the current letter
      n_pages_rgx = re.compile("page=([0-9]+)")
      try:
        n_pages = max(int(n_pages_rgx.search(x).group(1)) for x in page_refs)
      except Exception as e:
        # no pagination links found: assume a single page for this letter
        logger.exception("an error occurred while scraping the site (there may be a single page): {e}".format(e=e))
        n_pages = 1
      for page_number in range(1, n_pages + 1):
        page = "alpha.php?lettre={l}&page={k}&fpp=200".format(l=letter.lower(), k=page_number)
        # skip a hard-coded set of pages for letters A and B
        if not ((letter == "A" and page_number in list(range(1, 11)) + [20]) or (letter == "B" and page_number in list(range(1, 11)) + [24])):
          logger.info("downloading page {p}".format(p=page))
          page_url = "https://www.dafont.com/" + page

          raw_html = requests.get(page_url, headers={"user-agent": my_ua}).text
          page_html = BeautifulSoup(raw_html, "html.parser")
          dl_links = page_html.find_all("a", {"class": "dl"})

          for link in dl_links:
            href = link["href"]
            # random delay between yielded links to avoid flooding the site
            time.sleep(random.uniform(1, 2))
            yield "https:" + href


class LocalScrapper(Scrapper):

  """
  Scrapper simulator from local files
  
  Attributes:
      folders (str): Source folder
  """
  def __init__(self, folder: str):
    self.folder = folder

  def get_source_urls(self):

    current = Path(self.folder)
    for file in current.iterdir():
      if file.is_file():
        yield str(file)
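
LocalScrapper, defined at the end of the source above, makes it possible to exercise downstream stages without network access. A small sketch, assuming a throwaway folder holding a couple of fake font files:

import tempfile
from pathlib import Path

from fontai.io.scrappers import LocalScrapper

# build a throwaway folder with a couple of fake font files
tmp = Path(tempfile.mkdtemp())
for name in ("a.ttf", "b.ttf"):
  (tmp / name).write_bytes(b"\x00")

scrapper = LocalScrapper(str(tmp))
print(list(scrapper.get_source_urls()))  # paths of the two files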

Classes

class DafontsFileScrapper

Retrieves font files from www.dafont.com


Ancestors

  • fontai.io.scrappers.Scrapper
  • abc.ABC

Methods

def get_source_urls(self)

Returns a generator of strings, each pointing to a scrapable URL

class FreeFontsFileScrapper

Retrieves font files from www.1001freefonts.com


Ancestors

  • fontai.io.scrappers.Scrapper
  • abc.ABC

Methods

def get_source_urls(self)

Returns a generator of strings, each pointing to a scrapable URL

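The ID range is hard-coded and not every ID necessarily maps to a live font, so a caller may want to probe each URL before committing to a full download. A sketch under that assumption; the status-code handling is illustrative:

import requests

from fontai.io.scrappers import FreeFontsFileScrapper

scrapper = FreeFontsFileScrapper()
for url in scrapper.get_source_urls():
  # probe cheaply before committing to a full download
  resp = requests.head(url, allow_redirects=True, timeout=10)
  if resp.status_code == 200:
    print("live:", url)
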
class GoogleFontsScrapper

Retrieves the main zip file from the Google Fonts repository


Ancestors

  • fontai.io.scrappers.Scrapper
  • abc.ABC

Methods

def get_source_urls(self)

Returns a generator of strings, each pointing to a scrapable URL
