diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..63aaeb4 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,34 @@ +name: Publish Scribdl 🐍 distributions 📦 to PyPI + +on: push + +jobs: + build-n-publish: + name: Publish Scribdl 🐍 distributions 📦 to PyPI + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.x" + - name: Install pypa/build + run: >- + python -m + pip install + build + --user + - name: Build a binary wheel and a source tarball + run: >- + python -m + build + --sdist + --wheel + --outdir dist/ + . + + - name: Publish distribution 📦 to PyPI + if: startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/PKG-INFO b/PKG-INFO deleted file mode 100644 index 2699dca..0000000 --- a/PKG-INFO +++ /dev/null @@ -1,183 +0,0 @@ -Metadata-Version: 1.1 -Name: scribd-downloader -Version: 1.3.1 -Summary: Download documents, books and audiobooks off Scribd -Home-page: https://www.github.com/ritiek/scribd-downloader -Author: Ritiek Malhotra -Author-email: ritiekmalhotra123@gmail.com -License: MIT -Download-URL: https://github.com/ritiek/scribd-downloader/archive/v1.3.1.tar.gz -Description: Scribd-Downloader - ================= - - |PyPi Version| |Build Status| |Coverage Status| - - (I also found an online service https://dlscrib.com/ created by `Erik Fong`_. It doesn't - use this script as some people seem to think!). - - Current features: - - +------------+-------------------------------------+-------------------------------------------+ - | Type | Downloadable without Scribd premium | Requires Scribd premium for full download | - +============+=====================================+===========================================+ - | Documents | Yes | No | - +------------+-------------------------------------+-------------------------------------------+ - | Books | Yes | Yes | - +------------+-------------------------------------+-------------------------------------------+ - | Audiobooks | Yes | Yes | - +------------+-------------------------------------+-------------------------------------------+ - - **Some information about Scribd documents:** - - There are two types of documents on Scribd: - - - Documents made up using a collection of images and - - Actual documents where the text can be selected, copied etc. - - This script takes a different approach to both of them: - - - Documents consisting of a collection of images is straightforward and - this script will simply download the induvidual images which can - be combined to ``.pdf`` by passing ``--pdf`` option to the tool. Simple. - - - Actual documents where the text can be selected are hard to tackle. - If we feed such a document to this tool, only the text present in - document will be downloaded. Scribd seems to use javascript to somehow - combine text and images. So far, I haven't been able to combine them - with Python in a way they look like the original document. - - ------------ - Installation - ------------ - - Make sure you're using Python 3 (Python 2 is not supported by a few dependencies). - Then run these commands: - - :: - - $ pip install scribd-downloader - - or install the development version with: - - :: - - $ python setup.py install - - ----- - Usage - ----- - - :: - - usage: scribdl [-h] [-i] [-p] URL - - Download documents and books from scribd.com - - positional arguments: - URL scribd url to download - - optional arguments: - -h, --help show this help message and exit - -i, --images download url made up of images - -p, --pdf convert to pdf (*Nix: imagemagick) - -c CREDENTIALS_FILE, --credentials-file CREDENTIALS_FILE - path to file containing your Scribd premium - credentials - - -------- - Examples - -------- - - Scribd Documents - ---------------- - Downloading text from document containing selectable text: - :: - $ scribdl https://www.scribd.com/document/55949937/33-Strategies-of-War - - (Text will be saved side by side in a ``.md`` file in the current - working directory) - - Download document containing images; use the ``--images`` option (the tool cannot figure out this on its own): - :: - $ scribdl -i https://scribd.com/doc/17142797/Case-in-Point - - (Images will be saved in the current working directory) - - Scribd Books - ------------ - The below command will generate an ``.md`` file of the book in the current working directory: - :: - $ scribdl https://www.scribd.com/read/189087235/Confessions-of-a-Casting-Director-Help-Actors-Land-Any-Role-with-Secrets-from-Inside-the-Audition-Room - - Pass ``--pdf`` option to convert the generated output to a PDF. - - This will only dowload the book content available without owning a premium account on Scribd. - See the below section for downloading full books if you own a premium Scribd account. - - Scribd Audiobooks - ----------------- - This will download .mp3 of the audiobook: - :: - $ scribdl https://www.scribd.com/audiobook/237606860/100-Ways-to-Motivate-Yourself-Change-Your-Life-Forever - - This will only download the preview version of the audiobook. See the below section for - downloading complete audiobooks if you own a premium Scribd account. - - ------------------------------------------------- - Downloading complete textual books and audiobooks - ------------------------------------------------- - - If you have a premium Scribd account, you can also download the full version of - textual books and audiobooks. - - Create a text file containing your Scribd credentials, such that the contents of the file look like below: - :: - user@mail.com - password - - - Now pass the file path to the ``-c`` option, for example: - :: - $ scribdl -c scribd_credentials.txt https://www.scribd.com/audiobook/359295794/Principles-Life-and-Work - - It should then download you all the audiobook chapters as mp3. Similarly, you could also download complete - contents of a Scribd book by replacing the URL with the URL of your choice. - - If you're not willing to use place your account credentials in a file, you could also copy the cookie values - for ``_scribd_session`` and ``_scribd_expire`` when logged into your premium account on scribd on the web - browser and replace them with the ones in this file https://github.com/ritiek/scribd-downloader/blob/master/scribdl/const.py. - - You should then be able to automatically download full version of both textual books and audiobooks - from Scribd using the tool by running the commands as usual. - - ---------- - Disclaimer - ---------- - - Downloading books from Scribd for free maybe prohibited. This tool is - meant for educational purposes only. Please support the authors by buying - their titles. - - ------- - License - ------- - - ``The MIT License`` - - .. |PyPi Version| image:: https://img.shields.io/pypi/v/scribd-downloader.svg - :target: https://pypi.org/project/scribd-downloader - - .. |Build Status| image:: https://travis-ci.org/ritiek/scribd-downloader.svg?branch=master - :target: https://travis-ci.org/ritiek/scribd-downloader - - .. |Coverage Status| image:: https://codecov.io/gh/ritiek/scribd-downloader/branch/master/graph/badge.svg - :target: https://codecov.io/gh/ritiek/scribd-downloader - - .. _Mitmproxy: https://github.com/mitmproxy/mitmproxy - - .. _Erik Fong: mailto:dlscrib@gmail.com - .. _BookURL: https://www.scribd.com/read/189087235/Confessions-of-a-Casting-Director-Help-Actors-Land-Any-Role-with-Secrets-from-Inside-the-Audition-Room - .. ConstantValues: - -Keywords: scribd-downloader,documents,command-line,python -Platform: UNKNOWN diff --git a/scribd_downloader.egg-info/PKG-INFO b/scribd_downloader.egg-info/PKG-INFO deleted file mode 100644 index 2699dca..0000000 --- a/scribd_downloader.egg-info/PKG-INFO +++ /dev/null @@ -1,183 +0,0 @@ -Metadata-Version: 1.1 -Name: scribd-downloader -Version: 1.3.1 -Summary: Download documents, books and audiobooks off Scribd -Home-page: https://www.github.com/ritiek/scribd-downloader -Author: Ritiek Malhotra -Author-email: ritiekmalhotra123@gmail.com -License: MIT -Download-URL: https://github.com/ritiek/scribd-downloader/archive/v1.3.1.tar.gz -Description: Scribd-Downloader - ================= - - |PyPi Version| |Build Status| |Coverage Status| - - (I also found an online service https://dlscrib.com/ created by `Erik Fong`_. It doesn't - use this script as some people seem to think!). - - Current features: - - +------------+-------------------------------------+-------------------------------------------+ - | Type | Downloadable without Scribd premium | Requires Scribd premium for full download | - +============+=====================================+===========================================+ - | Documents | Yes | No | - +------------+-------------------------------------+-------------------------------------------+ - | Books | Yes | Yes | - +------------+-------------------------------------+-------------------------------------------+ - | Audiobooks | Yes | Yes | - +------------+-------------------------------------+-------------------------------------------+ - - **Some information about Scribd documents:** - - There are two types of documents on Scribd: - - - Documents made up using a collection of images and - - Actual documents where the text can be selected, copied etc. - - This script takes a different approach to both of them: - - - Documents consisting of a collection of images is straightforward and - this script will simply download the induvidual images which can - be combined to ``.pdf`` by passing ``--pdf`` option to the tool. Simple. - - - Actual documents where the text can be selected are hard to tackle. - If we feed such a document to this tool, only the text present in - document will be downloaded. Scribd seems to use javascript to somehow - combine text and images. So far, I haven't been able to combine them - with Python in a way they look like the original document. - - ------------ - Installation - ------------ - - Make sure you're using Python 3 (Python 2 is not supported by a few dependencies). - Then run these commands: - - :: - - $ pip install scribd-downloader - - or install the development version with: - - :: - - $ python setup.py install - - ----- - Usage - ----- - - :: - - usage: scribdl [-h] [-i] [-p] URL - - Download documents and books from scribd.com - - positional arguments: - URL scribd url to download - - optional arguments: - -h, --help show this help message and exit - -i, --images download url made up of images - -p, --pdf convert to pdf (*Nix: imagemagick) - -c CREDENTIALS_FILE, --credentials-file CREDENTIALS_FILE - path to file containing your Scribd premium - credentials - - -------- - Examples - -------- - - Scribd Documents - ---------------- - Downloading text from document containing selectable text: - :: - $ scribdl https://www.scribd.com/document/55949937/33-Strategies-of-War - - (Text will be saved side by side in a ``.md`` file in the current - working directory) - - Download document containing images; use the ``--images`` option (the tool cannot figure out this on its own): - :: - $ scribdl -i https://scribd.com/doc/17142797/Case-in-Point - - (Images will be saved in the current working directory) - - Scribd Books - ------------ - The below command will generate an ``.md`` file of the book in the current working directory: - :: - $ scribdl https://www.scribd.com/read/189087235/Confessions-of-a-Casting-Director-Help-Actors-Land-Any-Role-with-Secrets-from-Inside-the-Audition-Room - - Pass ``--pdf`` option to convert the generated output to a PDF. - - This will only dowload the book content available without owning a premium account on Scribd. - See the below section for downloading full books if you own a premium Scribd account. - - Scribd Audiobooks - ----------------- - This will download .mp3 of the audiobook: - :: - $ scribdl https://www.scribd.com/audiobook/237606860/100-Ways-to-Motivate-Yourself-Change-Your-Life-Forever - - This will only download the preview version of the audiobook. See the below section for - downloading complete audiobooks if you own a premium Scribd account. - - ------------------------------------------------- - Downloading complete textual books and audiobooks - ------------------------------------------------- - - If you have a premium Scribd account, you can also download the full version of - textual books and audiobooks. - - Create a text file containing your Scribd credentials, such that the contents of the file look like below: - :: - user@mail.com - password - - - Now pass the file path to the ``-c`` option, for example: - :: - $ scribdl -c scribd_credentials.txt https://www.scribd.com/audiobook/359295794/Principles-Life-and-Work - - It should then download you all the audiobook chapters as mp3. Similarly, you could also download complete - contents of a Scribd book by replacing the URL with the URL of your choice. - - If you're not willing to use place your account credentials in a file, you could also copy the cookie values - for ``_scribd_session`` and ``_scribd_expire`` when logged into your premium account on scribd on the web - browser and replace them with the ones in this file https://github.com/ritiek/scribd-downloader/blob/master/scribdl/const.py. - - You should then be able to automatically download full version of both textual books and audiobooks - from Scribd using the tool by running the commands as usual. - - ---------- - Disclaimer - ---------- - - Downloading books from Scribd for free maybe prohibited. This tool is - meant for educational purposes only. Please support the authors by buying - their titles. - - ------- - License - ------- - - ``The MIT License`` - - .. |PyPi Version| image:: https://img.shields.io/pypi/v/scribd-downloader.svg - :target: https://pypi.org/project/scribd-downloader - - .. |Build Status| image:: https://travis-ci.org/ritiek/scribd-downloader.svg?branch=master - :target: https://travis-ci.org/ritiek/scribd-downloader - - .. |Coverage Status| image:: https://codecov.io/gh/ritiek/scribd-downloader/branch/master/graph/badge.svg - :target: https://codecov.io/gh/ritiek/scribd-downloader - - .. _Mitmproxy: https://github.com/mitmproxy/mitmproxy - - .. _Erik Fong: mailto:dlscrib@gmail.com - .. _BookURL: https://www.scribd.com/read/189087235/Confessions-of-a-Casting-Director-Help-Actors-Land-Any-Role-with-Secrets-from-Inside-the-Audition-Room - .. ConstantValues: - -Keywords: scribd-downloader,documents,command-line,python -Platform: UNKNOWN diff --git a/scribd_downloader.egg-info/SOURCES.txt b/scribd_downloader.egg-info/SOURCES.txt deleted file mode 100644 index b763265..0000000 --- a/scribd_downloader.egg-info/SOURCES.txt +++ /dev/null @@ -1,32 +0,0 @@ -README.rst -setup.cfg -setup.py -scribd_downloader.egg-info/PKG-INFO -scribd_downloader.egg-info/SOURCES.txt -scribd_downloader.egg-info/dependency_links.txt -scribd_downloader.egg-info/entry_points.txt -scribd_downloader.egg-info/requires.txt -scribd_downloader.egg-info/top_level.txt -scribdl/__init__.py -scribdl/authorize.py -scribdl/command_line.py -scribdl/const.py -scribdl/downloader.py -scribdl/exceptions.py -scribdl/internals.py -scribdl/pdf_converter.py -scribdl/version.py -scribdl/content/__init__.py -scribdl/content/audiobook.py -scribdl/content/base.py -scribdl/content/book.py -scribdl/content/document.py -scribdl/content/test/__init__.py -scribdl/content/test/test_audiobook.py -scribdl/content/test/test_base.py -scribdl/content/test/test_book.py -scribdl/content/test/test_document.py -scribdl/test/__init__.py -scribdl/test/test_command_line.py -scribdl/test/test_download.py -scribdl/test/test_internals.py \ No newline at end of file diff --git a/scribd_downloader.egg-info/dependency_links.txt b/scribd_downloader.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789..0000000 --- a/scribd_downloader.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/scribd_downloader.egg-info/entry_points.txt b/scribd_downloader.egg-info/entry_points.txt deleted file mode 100644 index e3b8b08..0000000 --- a/scribd_downloader.egg-info/entry_points.txt +++ /dev/null @@ -1,3 +0,0 @@ -[console_scripts] -scribdl = scribdl.command_line:_command_line - diff --git a/scribd_downloader.egg-info/requires.txt b/scribd_downloader.egg-info/requires.txt deleted file mode 100644 index 9c4fa0a..0000000 --- a/scribd_downloader.egg-info/requires.txt +++ /dev/null @@ -1,4 +0,0 @@ -requests>=2.19.1 -BeautifulSoup4>=4.6.3 -img2pdf>=0.3.1 -md2pdf>=0.4 diff --git a/scribd_downloader.egg-info/top_level.txt b/scribd_downloader.egg-info/top_level.txt deleted file mode 100644 index 88a066c..0000000 --- a/scribd_downloader.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -scribdl diff --git a/scribdl/authorize.py b/scribdl/authorize.py index 0fa7401..e69de29 100644 --- a/scribdl/authorize.py +++ b/scribdl/authorize.py @@ -1,60 +0,0 @@ -import requests -import json -from bs4 import BeautifulSoup -from . import const -from . import exceptions - -SCRIBD_LOGIN_URL = "https://www.scribd.com/login" - -SCRIBD_LOGIN_HEADERS = { - "X-Requested-With": "XMLHttpRequest" -} - -SCRIBD_LOGIN_DATA = { - "signup_location": "https://www.scribd.com/" -} - - -def set_credentials(filepath): - """ - Reads username and password for Scribd premium account - from the file passed and overrides the default values - for headers and cookies. - """ - login_page = requests.get(SCRIBD_LOGIN_URL) - login_cookies = login_page.cookies - - with open(filepath, "r") as in_file: - content = in_file.read() - username, password = content.split() - - SCRIBD_LOGIN_DATA["login_or_email"] = username - SCRIBD_LOGIN_DATA["login_password"] = password - - # - soup = BeautifulSoup(login_page.text, features="html5lib") - csrf = soup.find("meta", dict(name="csrf-token")) - if csrf: - SCRIBD_LOGIN_HEADERS["X-CSRF-Token"] = csrf.attrs['content'] - - response = requests.post(SCRIBD_LOGIN_URL, - headers=SCRIBD_LOGIN_HEADERS, - cookies=login_cookies, - json=SCRIBD_LOGIN_DATA) - - if response.status_code != 200: - raise exceptions.ScribdFetchError("Login failed with status " + str(response.status_code)) - - #print(response.text) - result = json.loads(response.text) - # {"login":true,"success":true,"user":{"id":514698173}} - if not "login" in result or not result["login"]: - # {"form_name":null,"errors":[{"input_name":"login_or_email","msg":"No account found with that email or username. Please try again or sign up."}]} - errors = result["errors"] - if errors: - raise exceptions.ScribdFetchError("Login error: " + errors[0]["msg"]) - - const.premium_cookies["_scribd_session"] = response.cookies["_scribd_session"] - const.premium_cookies["_scribd_expire"] = response.cookies["_scribd_expire"] - - return response diff --git a/scribdl/const.py b/scribdl/const.py index 4d18ef1..e69de29 100644 --- a/scribdl/const.py +++ b/scribdl/const.py @@ -1,8 +0,0 @@ -# Replace these values with ones generated for your web-browser when -# logged into a Scribd premium-account. This will allow access to -# full audiobooks. - -premium_cookies = { - "_scribd_session": "eyJzZXNzaW9uX2lkIjoiqmQxZTM0MjUzMzZhMTMzMzgwZTc2ODg5ZGQ3ZjVkNmEiLCJfY3NyZl90b2tlbiI6IlY4bmhMRXo4S2RFZ0g2TnF1amZwUHNIVFBWZ1Z0SVhHTGZNTkF1Ull2NEU9IiwiY2FydDMiOnsiY2FydF9pdGVtcyI6W3sidGl0bGUiOiJNb250aCBQYXNzIiwiZGVzY3JpcHRpb24iOiIxIG1vbnRoIHVubGltaXRlZCBhY2Nlc3MgdG8gU2NyaWJkIGRvY3VtZW50cyIsInByaWNlIjp7ImFtb3VudCI6ODk5LCJjdXJyZW5jeSI6IlVTRCJ9LCJwdXJjaGFzZV9jbGFzc19uYW1lIjoiUGF5bWVudHM6OkFyY2hpdmVQdXJjaGFzZSIsInB1cmNoYXNlX2F0dHJpYnV0ZXMiOnt9fV0sInN1bW1hcnkiOiJSZWN1cnJpbmcgTW9udGhseSBNZW1iZXJzaGlwIGZvciAkOC45OS9tb250aCIsInN1YnNjcmlwdGlvbiI6dHJ1ZSwicHJvZHVjdF9oYW5kbGUiOiJzY3JpYmQtcG1wLW1vbnRobHktdW5saW1pdGVkLXN1YnNjcmlwdGlvbi05MDAiLCJzdWJzY3JpcHRpb25fZHVyYXRpb24iOiIxLm1vbnRoIiwiY2hlY2tvdXRfdGl0bGUiOiJSZWFkIGJvb2tzLCBhdWRpb2Jvb2tzLCBhbmQgbW9yZSBvbiBhbnkgZGV2aWNlIiwiY2FsbGJhY2tfY2xhc3NfbmFtZSI6IkFyY2hpdmVGbG93Q29udHJvbGxlciIsImFmdGVyX2NoZWNrb3V0X2xpbmsiOnsibGFiZWwiOiJCYWNrIHRvIFNjcmliZCIsInVybCI6Imh0dHBzOi8vd3d3LnNjcmliZC5jb20vIn0sIm1ldGFkYXRhIjp7ImNvbnRleHQiOiJwbXAiLCJwYWdlIjoiaG9tZSIsImFjdGlvbiI6ImV4cGlyZWRfYm9va19wcmV2aWV3IiwicmVzdG9yZV9wYWdlIjoiaG9tZSIsInJlc3RvcmVfYWN0aW9uIjoiZXhwaXJlZF9ib29rX3ByZXZpZXciLCJsb2dnZWRfaW4iOnRydWUsInBsYXRmb3JtIjoid2ViIiwiZ2FfY2xpZW50X2lkIjoiMTQ0OTM2ODE5MS4xNTQ3NjQ3MDQxIn0sIm1lc3NhZ2UiOiJZb3UgaGF2ZSBjb21wbGV0ZWQgeW91ciBmcmVlIHRyaWFsIGFuZCB3aWxsIGJlIGNoYXJnZWQgJDguOTkvbW9udGggZm9yIHRoZSBtZW1iZXJzaGlwLiIsInBheXBhbF9kaXNhYmxlZCI6ZmFsc2UsImV4dGlyZXNfYXQiOjE1NDgzOTEzNjN9LCJyIjoiMTU0NzkwNDY2MiIsIndvcmRfaWQiOjc1OTU5OTY5LCJwIjoxMzg2NjM1MzA0LCJsYXN0X3JlYXV0aCI6MTU0NzkwNDY2Mn0%3D--c979f3a1da6282d3f497d45d0324b4802ebcfcdc", - "_scribd_expire": "1547904862", -} diff --git a/scribdl/content/base.py b/scribdl/content/base.py index 1af999f..e69de29 100644 --- a/scribdl/content/base.py +++ b/scribdl/content/base.py @@ -1,65 +0,0 @@ -from bs4 import BeautifulSoup -import requests -from abc import ABCMeta, abstractmethod -import six - -from .. import internals - - -@six.add_metaclass(ABCMeta) -class ScribdBase: - """ - A base class for Scribd books, documents and audiobooks. - - Parameters - ---------- - url : `str` - A string containing Scribd URL. - """ - - def __init__(self, url): - self.url = url - self._title = None - self._sanitized_title = None - self._hidden_soup = None - - @property - def title(self): - """ - Scrapes the title of the Scribd document. - """ - if not self._title: - title = self._soup.find("h1").get_text() - # this unneed prefix may happen on textual books - unneeded_prefix = "Currently Reading: " - if title.startswith(unneeded_prefix): - title = title[len(unneeded_prefix):] - self._title = title - return self._title - - @property - def sanitized_title(self): - """ - Remove special characters from title to make - it suitable for filenames. - """ - if not self._sanitized_title: - self._sanitized_title = internals.sanitize_title(self.title) - return self._sanitized_title - - @abstractmethod - def download(self): - """ - An abstract method for fetching content off Scribd book or document. - """ - pass - - @property - def _soup(self): - """ - Parse HTML. - """ - if not self._hidden_soup: - response = requests.get(self.url) - self._hidden_soup = BeautifulSoup(response.text, "html.parser") - return self._hidden_soup diff --git a/scribdl/content/document.py b/scribdl/content/document.py index 5083d43..e69de29 100644 --- a/scribdl/content/document.py +++ b/scribdl/content/document.py @@ -1,204 +0,0 @@ -from bs4 import BeautifulSoup -import requests - -import os - -from abc import abstractmethod -from .base import ScribdBase -from .. import internals - - -class ScribdDocument(ScribdBase): - """ - A base class for downloading documents off Scribd. - - Parameters - ---------- - url : `str` - A string containing Scribd document URL. - """ - - def __init__(self, document_url): - super().__init__(document_url) - self.url = document_url - self._jsonp_urls = None - self._hidden_soup = None - - @property - def jsonp_urls(self): - """ - Extracts all URLs ending with '.jsonp' by parsing the - HTML code. - """ - if not self._jsonp_urls: - js_text = self._soup.find_all("script", type="text/javascript") - jsonp_urls = [] - for opening in js_text: - for inner_opening in opening: - jsonp = self._extract_jsonp_url(inner_opening) - if jsonp: - jsonp_urls.append(jsonp) - self._jsonp_urls = jsonp_urls - return self._jsonp_urls - - def _extract_jsonp_url(self, inner_opening): - """ - Extracts URLs ending with '.jsonp'. These URLs contain the - raw document text. - """ - portion1 = inner_opening.find("https://") - - if portion1 == -1: - jsonp = None - else: - portion2 = inner_opening.find(".jsonp") - jsonp = inner_opening[portion1 : portion2 + 6] - - return jsonp - - @abstractmethod - def download(self): - """ - An abstract method which will fetch the actual content - found in the '.jsonp' URLs. - """ - pass - - -class ScribdTextualDocument(ScribdDocument): - """ - A class for downloading textual documents off Scribd. - - Parameters - ---------- - document_url : `str` - A string containing Scribd document URL. - """ - - def __init__(self, document_url): - super().__init__(document_url) - self.filename = self.sanitized_title + ".md" - - def download(self, filename=None): - """ - Generates the filename and processes the text extraction - to this file. - """ - if not filename: - filename = self.filename - - print("Extracting text to", self.sanitized_title, "\n") - self._text_extractor(filename) - return filename - - def _text_extractor(self, filename): - """ - Saves text from every '.jsonp' URL. - """ - for jsonp_url in self.jsonp_urls: - self._save_text(jsonp_url, filename) - - def _save_text(self, jsonp, filename): - """ - Makes a GET request to the '.jsonp' URL and saves - the text to the passed file. - """ - response = requests.get(jsonp).text - page_no = response[11:12] - - response_head = ( - (response) - .replace("window.page" + page_no + '_callback(["', "") - .replace("\\n", "") - .replace("\\", "") - .replace('"]);', "") - ) - soup_content = BeautifulSoup(response_head, "html.parser") - - for x in soup_content.find_all("span", {"class": "a"}): - xtext = internals.fix_encoding(x.get_text()) - print(xtext) - - extraction = xtext + "\n\n" - with open(filename, "a") as feed: - feed.write(extraction) - - -class ScribdImageDocument(ScribdDocument): - """ - A class for downloading image documents off Scribd. - - Parameters - ---------- - document_url : `str` - A string containing Scribd document URL. - """ - - def __init__(self, document_url): - super().__init__(document_url) - self._image_download_counter = 1 - - def download(self, initial_filename=None): - """ - Function for downloading images off '.jsonp' URLs to - filenames. - """ - if not initial_filename: - initial_filename = self.sanitized_title - - downloaded_html_images = self._html_image_extractor(initial_filename) - downloaded_jsonp_images = self._jsonp_image_extractor(initial_filename) - return downloaded_html_images + downloaded_jsonp_images - - def _jsonp_image_extractor(self, initial_filename): - """ - Extract images from extracted .jsonp URLs. - """ - downloaded_images = [] - found = self._image_download_counter > 1 - for jsonp_url in self.jsonp_urls: - filename = "{}_{}.jpg".format(initial_filename, self._image_download_counter) - img_url = self._convert_jsonp_url_to_image_url(jsonp_url, found=found) - self._save_image(img_url, filename) - downloaded_images.append(filename) - self._image_download_counter += 1 - return downloaded_images - - def _html_image_extractor(self, initial_filename): - """ - Extracts images that are directly embedded in the original - HTML page. - """ - downloaded_images = [] - absimg = self._soup.find_all("img", {"class": "absimg"}, src=True) - for img in absimg: - filename = "{}_{}.jpg".format(initial_filename, self._image_download_counter) - self._save_image(img["src"], filename) - downloaded_images.append(filename) - self._image_download_counter += 1 - return downloaded_images - - def _convert_jsonp_url_to_image_url(self, jsonp_url, found): - """ - Gets the image URL corresponding to the '.jsonp' URL. - """ - if jsonp_url.endswith(".jsonp"): - replacement = jsonp_url.replace("/pages/", "/images/") - if found: - replacement = replacement.replace(".jsonp", "/000.jpg") - else: - replacement = replacement.replace(".jsonp", ".jpg") - else: - replacement = jsonp_url - return replacement - - def _save_image(self, url, imagename): - """ - Skips downloading if the image is already downloaded, - otherwise downloads it locally. - """ - print("Downloading", imagename) - already_present = os.listdir(".") - if imagename in already_present: - return - internals.download_stream(url, imagename) diff --git a/scribdl/content/test/test_audiobook.py b/scribdl/content/test/test_audiobook.py index d918859..e69de29 100644 --- a/scribdl/content/test/test_audiobook.py +++ b/scribdl/content/test/test_audiobook.py @@ -1,87 +0,0 @@ -from .. import audiobook -from ... import exceptions - -import pytest - - -@pytest.fixture(scope="module") -def scribd_audiobook(): - return audiobook.ScribdAudioBook( - "https://www.scribd.com/audiobook/237606860/100-Ways-to-Motivate-Yourself-Change-Your-Life-Forever") - - -class TestScribdAudioBook: - def test_title(self, scribd_audiobook): - assert scribd_audiobook.title == "100 Ways to Motivate Yourself: Change Your Life Forever" - - def test_sanitized_title(self, scribd_audiobook): - assert scribd_audiobook.sanitized_title == "100_Ways_to_Motivate_Yourself__Change_Your_Life_Forever" - - def test_preview_url(self, scribd_audiobook): - assert scribd_audiobook.preview_url == "https://samples.findawayworld.com/19991/19991_sample.mp3" - - def test_scribd_id(self, scribd_audiobook): - assert scribd_audiobook.scribd_id == "237606860" - - def test_authenticate_url(self, scribd_audiobook): - assert scribd_audiobook.authenticate_url == "https://www.scribd.com/listen/237606860" - - def test_author_id(self, scribd_audiobook): - assert scribd_audiobook.author_id == None - - def test_book_id(self, scribd_audiobook): - assert scribd_audiobook.book_id == "19991" - - def test_playlist_url(self, scribd_audiobook): - assert scribd_audiobook.playlist_url == "https://api.findawayworld.com/v4/audiobooks/19991/playlists" - - def test_premium_cokies(self, scribd_audiobook): - assert scribd_audiobook.premium_cookies == False - - def test_license_url(self, scribd_audiobook): - assert scribd_audiobook.license_url == "https://api.findawayworld.com/v4/accounts/scribd-None/audiobooks/19991" - - def test_license_id(self, scribd_audiobook): - assert scribd_audiobook.license_id == None - - -class TestPlaylist: - def test_playlist_instance(self, scribd_audiobook): - assert isinstance(scribd_audiobook.playlist, audiobook.Playlist) - - def test_playlist_title(self, scribd_audiobook): - assert scribd_audiobook.playlist.title == "100 Ways to Motivate Yourself: Change Your Life Forever" - - def test_playlist_sanitized_title(self, scribd_audiobook): - assert scribd_audiobook.playlist.sanitized_title == "100_Ways_to_Motivate_Yourself__Change_Your_Life_Forever" - - def test_playlist_raw_content(self, scribd_audiobook): - raw_content = {'expires': None, - 'playlist': [{'chapter_number': 'preview', - 'part_number': 'preview', - 'url': 'https://samples.findawayworld.com/19991/19991_sample.mp3'}], - 'playlist_token': None} - assert scribd_audiobook.playlist._playlist == raw_content - - -class TestTrack: - def test_track_instance(self, scribd_audiobook): - assert isinstance(scribd_audiobook.playlist.tracks[0], audiobook.Track) - - def test_track_count(self, scribd_audiobook): - assert len(scribd_audiobook.playlist.tracks) == 1 - - def test_track_url(self, scribd_audiobook): - assert scribd_audiobook.playlist.tracks[0].url == "https://samples.findawayworld.com/19991/19991_sample.mp3" - - def test_track_part_number(self, scribd_audiobook): - assert scribd_audiobook.playlist.tracks[0].part_number == "preview" - - def test_track_chapter_number(self, scribd_audiobook): - assert scribd_audiobook.playlist.tracks[0].chapter_number == "preview" - - def test_track_raw_content(self, scribd_audiobook): - raw_content = {'chapter_number': 'preview', - 'part_number': 'preview', - 'url': 'https://samples.findawayworld.com/19991/19991_sample.mp3'} - assert scribd_audiobook.playlist.tracks[0]._track == raw_content diff --git a/scribdl/content/test/test_base.py b/scribdl/content/test/test_base.py index a481589..e69de29 100644 --- a/scribdl/content/test/test_base.py +++ b/scribdl/content/test/test_base.py @@ -1,26 +0,0 @@ -from .. import base - -import pytest - - -def test_abstract_class(): - with pytest.raises(TypeError): - x = base.ScribdBase() - - -class ScribdBaseTop(base.ScribdBase): - def download(self): - pass - - -class TestScribdBase: - @pytest.fixture(scope="class") - def scribd_base(self): - return ScribdBaseTop( - "https://www.scribd.com/audiobook/367877343/Intelligence-in-Nature-An-Inquiry-into-Knowledge") - - def test_title(self, scribd_base): - assert scribd_base.title == "Intelligence in Nature: An Inquiry into Knowledge" - - def test_sanitized_title(self, scribd_base): - assert scribd_base.sanitized_title == "Intelligence_in_Nature__An_Inquiry_into_Knowledge" diff --git a/scribdl/content/test/test_book.py b/scribdl/content/test/test_book.py index b0743a6..e69de29 100644 --- a/scribdl/content/test/test_book.py +++ b/scribdl/content/test/test_book.py @@ -1,26 +0,0 @@ -from .. import book - -import pytest - - -@pytest.fixture(scope="module") -def scribd_book(): - return book.ScribdBook( - "https://www.scribd.com/read/189087235/Confessions-of-a-Casting-Director-Help-Actors-Land-Any-Role-with-Secrets-from-Inside-the-Audition-Room") - - -class TestScribdBook: - def test_title(self, scribd_book): - assert scribd_book.title == "Confessions of a Casting Director: Help Actors Land Any Role with Secrets from Inside the Audition Room" - - def test_sanitized_title(self, scribd_book): - assert scribd_book.sanitized_title == "Confessions_of_a_Casting_Director__Help_Actors_Land_Any_Role_with_Secrets_from_Inside_the_Audition_Room" - - def test_filename(self, scribd_book): - assert scribd_book.filename == "Confessions_of_a_Casting_Director__Help_Actors_Land_Any_Role_with_Secrets_from_Inside_the_Audition_Room.md" - - def test_book_id(self, scribd_book): - assert scribd_book.book_id == 189087235 - - def test_url(self, scribd_book): - assert scribd_book.url == "https://www.scribd.com/read/189087235/Confessions-of-a-Casting-Director-Help-Actors-Land-Any-Role-with-Secrets-from-Inside-the-Audition-Room" diff --git a/scribdl/content/test/test_document.py b/scribdl/content/test/test_document.py index a2debfe..e69de29 100644 --- a/scribdl/content/test/test_document.py +++ b/scribdl/content/test/test_document.py @@ -1,43 +0,0 @@ -from .. import document - -import pytest - - -@pytest.fixture(scope="module") -def scribd_textual_document(): - return document.ScribdTextualDocument( - "https://www.scribd.com/document/55949937/33-Strategies-of-War") - - -@pytest.fixture(scope="module") -def scribd_image_document(): - return document.ScribdImageDocument( - "https://scribd.com/doc/17142797/Case-in-Point") - - -class TestScribdTextualDocument: - def test_title(self, scribd_textual_document): - assert scribd_textual_document.title == "33 Strategies of War" - - def test_sanitized_title(self, scribd_textual_document): - assert scribd_textual_document.sanitized_title == "33_Strategies_of_War" - - def test_url(self, scribd_textual_document): - assert scribd_textual_document.url == "https://www.scribd.com/document/55949937/33-Strategies-of-War" - - def test_jsonp_urls(self, scribd_textual_document): - assert len(scribd_textual_document.jsonp_urls) == 19 - - -class TestScribdImageDocument: - def test_title(self, scribd_image_document): - assert scribd_image_document.title == "Case in Point" - - def test_sanitized_title(self, scribd_image_document): - assert scribd_image_document.sanitized_title == "Case_in_Point" - - def test_url(self, scribd_image_document): - assert scribd_image_document.url == "https://scribd.com/doc/17142797/Case-in-Point" - - def test_jsonp_urls(self, scribd_image_document): - assert len(scribd_image_document.jsonp_urls) == 182 diff --git a/scribdl/exceptions.py b/scribdl/exceptions.py index 1a7da66..e69de29 100644 --- a/scribdl/exceptions.py +++ b/scribdl/exceptions.py @@ -1,12 +0,0 @@ -class ScribdFetchError(Exception): - """ - Handle exceptions for anything Scribd. - - Parameters - ---------- - message: `str` - Exception message. - """ - - def __init__(self, message): - super().__init__(message) diff --git a/scribdl/pdf_converter.py b/scribdl/pdf_converter.py index 2a010fc..e69de29 100644 --- a/scribdl/pdf_converter.py +++ b/scribdl/pdf_converter.py @@ -1,48 +0,0 @@ -from md2pdf.core import md2pdf -import img2pdf -import os - - -class ConvertToPDF: - """ - A class for converting downloading books and documents to PDF. - - Parameters - ---------- - input_content : `str`, `list` - A string containing path to a single markdown file - or a list containing paths to many images. - output_content : `str` - Output path of the generated PDF. - """ - - def __init__(self, input_content, output_path): - self.input_content = input_content - self.pdf_path = output_path - - def to_pdf(self): - """ - Converts to PDF depending upon the type of content, - i.e. images or markdown. - """ - if isinstance(self.input_content, list): - self._images_to_pdf() - else: - self._markdown_to_pdf() - - def _markdown_to_pdf(self): - """ - Converts markdown to PDF. - """ - md2pdf(self.pdf_path, - md_file_path=self.input_content, - base_url=os.getcwd()) - - def _images_to_pdf(self): - """ - Converts images to PDF. - """ - with open(self.pdf_path, "wb") as f: - open_images = [open(img, "rb") for img in self.input_content] - pdf_images = img2pdf.convert(open_images) - f.write(pdf_images) diff --git a/scribdl/test/test_command_line.py b/scribdl/test/test_command_line.py index 1ed0c6e..e69de29 100644 --- a/scribdl/test/test_command_line.py +++ b/scribdl/test/test_command_line.py @@ -1,44 +0,0 @@ -from .. import command_line -import sys - -import pytest - - -class TestCommandLine: - def test_empty(self): - args = [] - parser = command_line.get_arguments() - with pytest.raises(SystemExit): - parser.parse_args(args) - - def test_image_no_url(self): - args = [] - args.append("-i") - parser = command_line.get_arguments() - with pytest.raises(SystemExit): - parser.parse_args(args) - - def test_image_url(self): - args = [] - args.append("-i") - args.append("https://example.com/") - parser = command_line.get_arguments() - parsed_args = parser.parse_args(args) - assert parsed_args.images and not parsed_args.pdf - - def test_pdf_url(self): - args = [] - args.append("-p") - args.append("https://example.com/") - parser = command_line.get_arguments() - parsed_args = parser.parse_args(args) - assert not parsed_args.images and parsed_args.pdf - - def test_image_pdf_url(self): - args = [] - args.append("-i") - args.append("-p") - args.append("https://example.com/") - parser = command_line.get_arguments() - parsed_args = parser.parse_args(args) - assert parsed_args.images and parsed_args.pdf diff --git a/scribdl/test/test_internals.py b/scribdl/test/test_internals.py index f410beb..e69de29 100644 --- a/scribdl/test/test_internals.py +++ b/scribdl/test/test_internals.py @@ -1,15 +0,0 @@ -from .. import internals - -import pytest - - -SANITIZE_TITLE_TEST_TABLE = [ - ("good_title", "good_title"), - ("*bla", "_bla"), - ("**free_as_in_**", "__free_as_in__freedom___"), - ("troller*\"/\<>:|(haha)jojo", "troller_________haha_jojo"), -] - -@pytest.mark.parametrize("input_str, expected_str", SANITIZE_TITLE_TEST_TABLE) -def test_sanitize_title(input_str, expected_str): - assert internals.sanitize_title(input_str) == expected_str diff --git a/scribdl/version.py b/scribdl/version.py index 9c73af2..f708a9b 100644 --- a/scribdl/version.py +++ b/scribdl/version.py @@ -1 +1 @@ -__version__ = "1.3.1" +__version__ = "1.3.2" diff --git a/setup.py b/setup.py index bedcfc8..b88dfc4 100644 --- a/setup.py +++ b/setup.py @@ -10,17 +10,18 @@ with open("README.rst", "r") as f: long_description = f.read() -setup(name='scribd-downloader', +setup(name='scribd-download', version=__version__, description='Download documents, books and audiobooks off Scribd', - long_description=long_description, + long_description='Check long description in https://github.com/Phoenix124/scribd-downloader', + long_description_content_type='text/x-rst', author='Ritiek Malhotra', author_email='ritiekmalhotra123@gmail.com', packages = find_packages(), entry_points={ - 'console_scripts': [ - 'scribdl = scribdl.command_line:_command_line', - ] + 'console_scripts': [ + 'scribdl = scribdl.command_line:_command_line', + ] }, url='https://www.github.com/ritiek/scribd-downloader', keywords=['scribd-downloader', 'documents', 'command-line', 'python'], @@ -28,9 +29,9 @@ download_url='https://github.com/ritiek/scribd-downloader/archive/v' + __version__ + '.tar.gz', classifiers=[], install_requires=[ - 'requests >= 2.19.1', - 'BeautifulSoup4 >= 4.6.3', - 'img2pdf >= 0.3.1', - 'md2pdf >= 0.4' + 'requests >= 2.19.1', + 'BeautifulSoup4 >= 4.6.3', + 'img2pdf >= 0.3.1', + 'md2pdf >= 0.4' ] - ) + )