feat: no need kindle

This commit is contained in:
yihong0618
2023-06-26 20:56:45 +08:00
parent 539454c959
commit 05369d5a1d
48 changed files with 31319 additions and 12 deletions

4
.gitignore vendored
View File

@@ -133,4 +133,6 @@ dmypy.json
# remove MacOS .DS_Store
.DS_Store
EPUB/
EPUB/
.device_id
.tokens

329
cli.py Normal file
View File

@@ -0,0 +1,329 @@
import argparse
import json
import logging
import os
import urllib3
from kindle_download_helper.config import (
DEFAULT_OUT_DEDRM_DIR,
DEFAULT_OUT_DIR,
DEFAULT_OUT_EPUB_DIR,
DEFAULT_SESSION_FILE,
)
from kindle_download_helper.kindle import Kindle
# Module logger; errors are additionally appended to .error_books.log so
# failed downloads can be reviewed after a long batch run.
logger = logging.getLogger("kindle")
fh = logging.FileHandler(".error_books.log")
fh.setLevel(logging.ERROR)
logger.addHandler(fh)

# Silence InsecureRequestWarning noise.  NOTE(review): presumably the Kindle
# client makes requests with verify=False — confirm against kindle.py.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# download selected books for cli
def download_selected_books(kindle, options):
# get all books and get the default device
print("Getting all books, please wait...")
books = kindle.get_all_books(filetype=options.filetype)
device = kindle.find_device()
# print all books
for idx, book in enumerate(books):
print(
"Index: "
+ "{:>5d}".format(idx + 1)
+ " | Title: "
+ book["title"]
+ " | asin: "
+ book["asin"]
)
# download loop
while True:
# get the indices of the books to download
indices = input(
"Input the index of books you want to download, split by space (q to quit, l to list books).\n"
).split()
# if input "q", quit
# if input "l", list all books again
if indices[0] == "q":
break
elif indices[0] == "l":
for idx, book in enumerate(books):
print(
"Index: "
+ "{:>5d}".format(idx + 1)
+ " | Title: "
+ book["title"]
+ " | asin: "
+ book["asin"]
)
continue
# decode the indices
downlist = []
flag = True
for idx in indices:
if idx.isnumeric() == False:
if ":" in idx:
# if is not a number, and ":" in it, then it is a range
# decode the range
idx_begin, idx_end = [int(i) for i in idx.split(":")]
# append the range to downlist
extend_list = [i for i in range(idx_begin - 1, idx_end)]
downlist.extend(extend_list)
else:
# if is not a number, and no ":" in it, then it is an error
print("Input error, please input numbers!!!")
flag = False
break
else:
# if is a number, then append it to downlist
downlist.append(int(idx) - 1)
if not flag:
continue
# remove the duplicate indices
downlist = list(set(downlist))
# check if the indices are valid
if max(downlist) >= len(books) or min(downlist) < 0:
print(
"Input error, please input numbers between 1 and "
+ str(len(books))
+ "!!!"
)
continue
# print the books to download
for idx in downlist:
print(
"Index: "
+ "{:>5d}".format(idx + 1)
+ " | Title: "
+ books[idx]["title"]
+ " | asin: "
+ books[idx]["asin"]
)
print("Downloading " + str(len(downlist)) + " books:")
# ask if to continue
while True:
flag = input("Continue? (y/n)")
if flag == "y" or flag == "n":
break
else:
print("Input error, please input y or n")
if flag == "n":
continue
# download the books
for i, idx in enumerate(downlist):
print(
"Downloading "
+ str(i + 1)
+ "/"
+ str(len(downlist))
+ " "
+ books[idx]["title"]
+ " ..."
)
kindle.download_one_book(books[idx], device, idx, filetype=options.filetype)
print("Download finished.")
def main():
    """CLI entry point: parse arguments, build a Kindle client and dispatch.

    Dispatches to listing (--list), README stats (--readme), bulk download
    (--mode all, the default) or interactive selection (--mode sel).
    """
    logger.setLevel(os.environ.get("LOGGING_LEVEL", "INFO"))
    logger.addHandler(logging.StreamHandler())

    parser = argparse.ArgumentParser()
    parser.add_argument("csrf_token", help="amazon or amazon cn csrf token", nargs="?")

    cookie_group = parser.add_mutually_exclusive_group()
    cookie_group.add_argument(
        "--cookie", dest="cookie", default="", help="amazon or amazon cn cookie"
    )
    cookie_group.add_argument(
        "--cookie-file", dest="cookie_file", default="", help="load cookie local file"
    )

    # Marketplace flags all write to the same dest; the last flag given wins.
    parser.add_argument(
        "--cn",
        dest="domain",
        action="store_const",
        const="cn",
        default="com",
        help="if your account is an amazon.cn account",
    )
    parser.add_argument(
        "--jp",
        dest="domain",
        action="store_const",
        const="jp",
        default="com",
        help="if your account is an amazon.co.jp account",
    )
    parser.add_argument(
        "--de",
        dest="domain",
        action="store_const",
        const="de",
        default="com",
        help="if your account is an amazon.de account",
    )
    parser.add_argument(
        "--uk",
        dest="domain",
        action="store_const",
        const="uk",
        default="com",
        help="if your account is an amazon.co.uk account",
    )
    parser.add_argument(
        "--resume-from",
        dest="index",
        type=int,
        default=1,
        help="resume from the index if download failed",
    )
    parser.add_argument(
        "--cut-length",
        dest="cut_length",
        type=int,
        default=100,
        help="truncate the file name",
    )
    # help-string typo fixed: "dwonload" -> "download" (three occurrences)
    parser.add_argument(
        "-o", "--outdir", default=DEFAULT_OUT_DIR, help="download output dir"
    )
    parser.add_argument(
        "-od",
        "--outdedrmdir",
        default=DEFAULT_OUT_DEDRM_DIR,
        help="download output dedrm dir",
    )
    parser.add_argument(
        "-oe",
        "--outepubmdir",
        default=DEFAULT_OUT_EPUB_DIR,
        help="download output epub dir",
    )
    parser.add_argument(
        "-s",
        "--session-file",
        default=DEFAULT_SESSION_FILE,
        help="The reusable session dump file",
    )
    parser.add_argument(
        "--pdoc",
        dest="filetype",
        action="store_const",
        const="PDOC",
        default="EBOK",
        help="to download personal documents or ebook",
    )
    parser.add_argument(
        "--resolve_duplicate_names",
        dest="resolve_duplicate_names",
        action="store_true",
        help="Resolve duplicate names files to download",
    )
    parser.add_argument(
        "--readme",
        dest="readme",
        action="store_true",
        help="If you want to generate kindle readme stats",
    )
    parser.add_argument(
        "--dedrm",
        dest="dedrm",
        action="store_true",
        help="If you want to `dedrm` directly",
    )
    parser.add_argument(
        "--list",
        dest="list_only",
        action="store_true",
        help="just list books/pdoc, not to download",
    )
    parser.add_argument(
        "--device_sn",
        dest="device_sn",
        default="",
        help="Download file for device with this serial number",
    )
    parser.add_argument(
        "--mode",
        dest="mode",
        default="all",
        help="Mode of download, all: download all files at once, sel: download selected files",
    )

    options = parser.parse_args()

    # Ensure the three output directories exist (plain, dedrm, epub).
    os.makedirs(options.outdir, exist_ok=True)
    os.makedirs(options.outdedrmdir, exist_ok=True)
    os.makedirs(options.outepubmdir, exist_ok=True)

    kindle = Kindle(
        options.csrf_token,
        options.domain,
        options.outdir,
        options.outdedrmdir,
        options.outepubmdir,
        options.cut_length,
        session_file=options.session_file,
        device_sn=options.device_sn,
    )
    # other args
    kindle.to_resolve_duplicate_names = options.resolve_duplicate_names
    kindle.dedrm = options.dedrm

    # Cookie precedence: file > inline string > browser cookies.
    if options.cookie_file:
        with open(options.cookie_file, "r") as f:
            kindle.set_cookie_from_string(f.read())
    elif options.cookie:
        kindle.set_cookie_from_string(options.cookie)
    else:
        kindle.is_browser_cookie = True

    if options.list_only:
        kindle.get_devices()
        print(
            json.dumps(
                kindle.get_all_books(filetype=options.filetype),
                indent=4,
                ensure_ascii=False,
            )
        )
        # equivalent to site's exit(); does not rely on the site module
        raise SystemExit

    if options.readme:
        # generate readme stats
        kindle.make_kindle_stats_readme()
    else:
        # check the download mode
        if options.mode == "all":
            # download all books
            kindle.download_books(
                start_index=options.index - 1, filetype=options.filetype
            )
        elif options.mode == "sel":
            # download selected books
            download_selected_books(kindle, options)
        else:
            print("mode error, please input all or sel")


if __name__ == "__main__":
    main()

View File

@@ -1,2 +1,3 @@
from kindle_download_helper.cli import main
from kindle_download_helper import kindle
from kindle_download_helper.cli import main
from kindle_download_helper.no_cli import no_main

View File

@@ -0,0 +1,319 @@
import base64
import datetime
import gzip
import hashlib
import hmac
import json
import os
import secrets
import sys
import time
import uuid
from urllib.parse import urlparse
import requests
import xmltodict
from Crypto.Cipher import AES
from Crypto.Hash import SHA256
from Crypto.Protocol.KDF import PBKDF2
from Crypto.PublicKey import RSA
from Crypto.Signature import pkcs1_15
# Paths for the persisted device identity and auth tokens; stored next to the
# launched script (sys.argv[0]), not next to this module.
SCRIPT_PATH = os.path.dirname(os.path.realpath(sys.argv[0]))
DEVICE_ID_PATH = os.path.join(SCRIPT_PATH, ".device_id")
TOKENS_PATH = os.path.join(SCRIPT_PATH, ".tokens")

# Reuse a previously generated device id so Amazon sees a stable "device";
# otherwise generate a random 16-byte hex id and persist it for next time.
if os.path.isfile(DEVICE_ID_PATH):
    with open(DEVICE_ID_PATH, "r") as f:
        DEVICE_ID = f.read()
else:
    with open(DEVICE_ID_PATH, "w") as f:
        DEVICE_ID = secrets.token_hex(16)
        f.write(DEVICE_ID)

# 8 upper-cased hex chars (positions 23..30) of SHA-256(device id); used as
# the <pid> element when registering the device.
PID = hashlib.sha256(DEVICE_ID.encode()).hexdigest()[23:31].upper()
def save_tokens(tokens):
    """Persist the auth-token dict to TOKENS_PATH as JSON."""
    with open(TOKENS_PATH, "w") as fp:
        json.dump(tokens, fp)
def get_tokens():
    """Load previously saved tokens, or ``None`` when no token file exists."""
    if not os.path.isfile(TOKENS_PATH):
        return None
    with open(TOKENS_PATH, "r") as fp:
        return json.load(fp)
# Identity this client presents to Amazon: the ComiXology ("comix") Android
# app running on a Pixel 2.  DEVICE_TYPE and PFM are Amazon device-type and
# marketplace identifiers sent to the auth/delivery endpoints.
APP_NAME = "com.iconology.comix"
APP_VERSION = "1221328936"
DEVICE_NAME = "walleye/google/Pixel 2"
DEVICE_TYPE = "A2A33MVZVPQKHY"
MANUFACTURER = "Google"
OS_VERSION = "google/walleye/walleye:8.1.0/OPM1.171019.021/4565141:user/release-keys"
PFM = "A1F83G8C2ARO7P"
# software version string reused across several request headers
SW_VERSION = "1221328936"
def get_auth_headers(domain):
    """Headers for the api.amazon.<domain> auth endpoints.

    Includes a fresh dash-free request id per call.
    """
    request_id = uuid.uuid4().hex
    return {
        "Accept-Charset": "utf-8",
        "User-Agent": "Dalvik/2.1.0 (Linux; U; Android 10; Pixel 2 Build/OPM1.171019.021)",
        "x-amzn-identity-auth-domain": f"api.amazon.{domain}",
        "x-amzn-requestid": request_id,
    }
def get_api_headers():
    """Static headers for the Kindle content/delivery API requests.

    Mimics the Android okhttp client; the x-adp-* fields carry the fake
    device/app identity declared in the module constants.
    """
    return {
        "accept": "*/*",
        "accept-encoding": "gzip",
        "accept-language": "en-US",
        "currenttransportmethod": "WiFi",
        "is_archived_items": "1",
        "software_rev": SW_VERSION,
        "user-agent": "okhttp/3.12.1",
        "x-adp-app-id": APP_NAME,
        "x-adp-app-sw": SW_VERSION,
        "x-adp-attemptcount": "1",
        "x-adp-cor": "US",
        "x-adp-country": "US",
        "x-adp-lto": "0",
        "x-adp-pfm": PFM,
        "x-adp-reason": "ArchivedItems",
        "x-adp-sw": SW_VERSION,
        "x-adp-transport": "WiFi",
        # manifest media type; see signed_request for the DRM_VOUCHER override
        "x-amzn-accept-type": "application/x.amzn.digital.deliverymanifest@1.0",
    }
def generate_frc(device_id):
    """Build the "frc" device-fingerprint cookie for Amazon's /auth/register.

    Collects device/app metadata (including this machine's public IP from
    api.ipify.org), gzips it, AES-CBC-encrypts it with a key derived from
    ``device_id``, and prepends an HMAC-SHA256 tag over iv+ciphertext.

    Returns:
        The base64-encoded frc cookie string.
    """
    cookies = json.dumps(
        {
            "ApplicationName": APP_NAME,
            "ApplicationVersion": APP_VERSION,
            "DeviceLanguage": "en",
            "DeviceName": DEVICE_NAME,
            "DeviceOSVersion": OS_VERSION,
            # network call: fetches the caller's public IP address
            "IpAddress": requests.get("https://api.ipify.org").text,
            "ScreenHeightPixels": "1920",
            "ScreenWidthPixels": "1280",
            "TimeZone": "00:00",
        }
    )

    def pkcs7_pad(data):
        # pad to a whole AES block; every pad byte equals the pad length
        padsize = 16 - len(data) % 16
        return data + bytes([padsize]) * padsize

    compressed = gzip.compress(cookies.encode())
    # two independent keys derived from the device id: one for AES, one for HMAC
    key = PBKDF2(device_id, b"AES/CBC/PKCS7Padding")
    iv = secrets.token_bytes(16)
    cipher = AES.new(key, AES.MODE_CBC, iv)
    ciphertext = cipher.encrypt(pkcs7_pad(compressed))
    hmac_ = hmac.new(
        PBKDF2(device_id, b"HmacSHA256"), iv + ciphertext, hashlib.sha256
    ).digest()
    # layout: 0x00 | first 8 HMAC bytes | iv | ciphertext
    return base64.b64encode(b"\0" + hmac_[:8] + iv + ciphertext).decode()
def login(email, password, domain="com", device_id=DEVICE_ID):
    """Register this pseudo-device with Amazon and return the auth tokens.

    If tokens cached on disk belong to the same e-mail, only the access
    token is refreshed instead of re-registering.

    Args:
        email: Amazon account e-mail.
        password: Amazon account password.
        domain: Amazon TLD ("com", "cn", "jp", "de", ...).
        device_id: Device serial to register under.

    Returns:
        The token dict (persisted via register_device), or ``None`` when
        the register response lacks the expected token structure.
    """
    tokens = get_tokens()
    # md5(email) is only a cache key to tell accounts apart, not security
    if tokens and tokens["name"] == hashlib.md5(email.encode()).hexdigest():
        return refresh(tokens)
    body = {
        "auth_data": {
            "use_global_authentication": "true",
            "user_id_password": {"password": password, "user_id": email},
        },
        "registration_data": {
            "domain": "DeviceLegacy",
            "device_type": DEVICE_TYPE,
            "device_serial": device_id,
            "app_name": APP_NAME,
            "app_version": APP_VERSION,
            "device_model": DEVICE_NAME,
            "os_version": OS_VERSION,
            "software_version": SW_VERSION,
        },
        "requested_token_type": [
            "bearer",
            "mac_dms",
            "store_authentication_cookie",
            "website_cookies",
        ],
        "cookies": {"domain": f"amazon.{domain}", "website_cookies": []},
        "user_context_map": {"frc": generate_frc(device_id)},
        "device_metadata": {
            "device_os_family": "android",
            "device_type": DEVICE_TYPE,
            "device_serial": device_id,
            "manufacturer": MANUFACTURER,
            "model": DEVICE_NAME,
            "os_version": "30",
            "android_id": "e97690019ccaab2b",
            "product": DEVICE_NAME,
        },
        "requested_extensions": ["device_info", "customer_info"],
    }
    response_json = requests.post(
        f"https://api.amazon.{domain}/auth/register",
        headers=get_auth_headers(domain),
        json=body,
    ).json()
    try:
        tokens = {
            "name": hashlib.md5(
                email.encode()
            ).hexdigest(),  # to differentiate tokens from different accounts
            "domain": domain,
            "device_id": device_id,
            "access_token": response_json["response"]["success"]["tokens"]["bearer"][
                "access_token"
            ],
            "refresh_token": response_json["response"]["success"]["tokens"]["bearer"][
                "refresh_token"
            ],
            "device_private_key": response_json["response"]["success"]["tokens"][
                "mac_dms"
            ]["device_private_key"],
            "adp_token": response_json["response"]["success"]["tokens"]["mac_dms"][
                "adp_token"
            ],
        }
        return register_device(tokens)
    except (KeyError, TypeError):
        # was a bare `except:`; the response did not contain the expected
        # success payload (bad credentials, captcha/2FA, ...) — show it raw
        print(json.dumps(response_json))
        return None
def refresh(tokens):
    """Exchange the cached refresh token for a fresh access token.

    Mutates ``tokens["access_token"]`` in place; on an unexpected response
    the raw payload is printed and the dict is returned unchanged.
    """
    body = {
        "app_name": APP_NAME,
        "app_version": APP_VERSION,
        "source_token_type": "refresh_token",
        "source_token": tokens["refresh_token"],
        "requested_token_type": "access_token",
    }
    # NOTE(review): the URL is pinned to api.amazon.com while the headers use
    # tokens["domain"] — confirm non-.com accounts refresh correctly.
    response_json = requests.post(
        "https://api.amazon.com/auth/token",
        headers=get_auth_headers(tokens["domain"]),
        json=body,
    ).json()
    try:
        tokens["access_token"] = response_json["access_token"]
    except (KeyError, TypeError):
        # was a bare `except:`; only the missing/unexpected-payload case is real
        print(json.dumps(response_json))
    return tokens
def signed_request(
    method,
    url,
    headers=None,
    body=None,
    asin=None,
    tokens=None,
    request_id=None,
    request_type=None,
):
    """
    modified from https://github.com/mkb79/Audible/blob/master/src/audible/auth.py

    Builds a requests.PreparedRequest carrying Amazon ADP signature headers:
    "method\\npath\\ndate\\nbody\\nadp_token" is signed with the device RSA
    key (SHA256, PKCS#1 v1.5) and sent as x-adp-signature.

    Args:
        method: HTTP verb.
        url: Full URL; only path (and query, if any) enter the signature.
        headers: Optional header dict; defaults to get_api_headers().
        body: Optional request body; defaults to "".
        asin: When given, adds an x-adp-correlationid for this book.
        tokens: Dict holding "adp_token" and "device_private_key"; loaded
            from disk when omitted.
        request_id: Optional id; a timestamp suffix is appended when given,
            otherwise a fresh dash-free uuid4 is used.
        request_type: "DRM_VOUCHER" switches the accept header.

    Returns:
        The prepared request, or None when usable tokens are missing.
    """
    if not tokens:
        tokens = get_tokens()
    # fail fast with a message rather than raising deep inside signing
    if not tokens:
        print("Could not retrieve auth tokens")
        return None
    elif "adp_token" not in tokens:
        print("Could not find the adp token in tokens")
        return None
    elif "device_private_key" not in tokens:
        print("Could not find the private key in tokens")
        return None
    if not request_id:
        request_id = str(uuid.uuid4()).replace("-", "")
    else:
        request_id += str(int(time.time())) + "420"
    if not body:
        body = ""
    # naive UTC timestamp, second resolution, "Z"-suffixed
    date = datetime.datetime.utcnow().isoformat("T")[:-7] + "Z"
    u = urlparse(url)
    path = f"{u.path}"
    if u.query != "":
        # NOTE(review): u.params is placed BEFORE the "?" here — for URLs with
        # path parameters this looks suspect; verify against the signing spec.
        path += f"{u.params}?{u.query}"
    data = f"{method}\n{path}\n{date}\n{body}\n{tokens['adp_token']}"
    key = RSA.import_key(base64.b64decode(tokens["device_private_key"]))
    signed_encoded = base64.b64encode(pkcs1_15.new(key).sign(SHA256.new(data.encode())))
    signature = f"{signed_encoded.decode()}:{date}"
    if not headers:
        headers = get_api_headers()
    if asin:
        headers["x-adp-correlationid"] = f"{asin}-{int(time.time())}420.kindle.ebook"
    if request_type == "DRM_VOUCHER":
        headers["accept"] = "application/x-com.amazon.drm.Voucher@1.0"
    headers.update(
        {
            "x-adp-token": tokens["adp_token"],
            "x-adp-alg": "SHA256WithRSA:1.0",
            "x-adp-signature": signature,
            "x-amzn-requestid": request_id,
        }
    )
    return requests.Request(method, url, headers, data=body).prepare()
def register_device(tokens=None):
    """Fetch the device RSA key and ADP token for the registered device.

    POSTs a signed XML request to FirsProxy/registerAssociatedDevice and, on
    HTTP 200, stores device_private_key and adp_token into ``tokens`` and
    persists them to disk.

    Returns:
        The (possibly updated) token dict.
    """
    if not tokens:
        tokens = get_tokens()
    url = "https://firs-ta-g7g.amazon.com/FirsProxy/registerAssociatedDevice"
    headers = {
        "Content-Type": "text/xml",
        "Expect": "",
    }
    body = f"<?xml version=\"1.0\" encoding=\"UTF-8\"?><request><parameters><deviceType>{DEVICE_TYPE}</deviceType><deviceSerialNumber>{tokens['device_id']}</deviceSerialNumber><pid>{PID}</pid><deregisterExisting>false</deregisterExisting><softwareVersion>{SW_VERSION}</softwareVersion><softwareComponentId>{APP_NAME}</softwareComponentId><authToken>{tokens['access_token']}</authToken><authTokenType>ACCESS_TOKEN</authTokenType></parameters></request>"
    resp = requests.Session().send(
        signed_request("POST", url, headers, body, tokens=tokens)
    )
    if resp.status_code == 200:
        parsed_response = xmltodict.parse(resp.text)
        tokens["device_private_key"] = parsed_response["response"]["device_private_key"]
        tokens["adp_token"] = parsed_response["response"]["adp_token"]
        # only persist when the registration actually succeeded
        save_tokens(tokens)
    return tokens
if __name__ == "__main__":
    # CLI: amazon_auth.py <email> <password> <domain>
    if len(sys.argv) != 4:
        print("usage: amazon_auth.py <email> <password> <domain>")
        print("domains: com, co.uk, co.jp, de")
        exit()
    tokens = login(sys.argv[1], sys.argv[2], sys.argv[3])
    # `is None` instead of `== None` (PEP 8); login returns None on failure
    if tokens is None:
        print("Could not login!")
    else:
        print(json.dumps(tokens))

View File

@@ -68,3 +68,12 @@ MY_KINDLE_STATS_INFO = "- I bought {books_len} books\n- I pushed {pdocs_len} doc
KINDLE_TABLE_HEAD = "| ID | Title | Authors | Acquired | Read | \n | ---- | ---- | ---- | ---- | ---- |\n"
KINDLE_STAT_TEMPLATE = "| {id} | {title} | {authors} | {acquired} | {read} |\n"
API_MANIFEST_URL = (
"https://kindle-digital-delivery.amazon.com/delivery/manifest/kindle.ebook/"
)
API_HEADERS = {
"User-Agent": "Comics/3.10.17[3.10.17.310418] Google/10",
"x-client-application": "com.comixology.comics",
}

View File

@@ -1,2 +1,3 @@
from .kfxdedrm import KFXZipBook
from .kgenpids import get_pid_list
from .mobidedrm import MobiBook

View File

@@ -0,0 +1,383 @@
import base64
import hmac
import json
import logging
import os
import pathlib
import struct
from hashlib import sha256
from typing import Dict, Optional, Tuple, Union
from pbkdf2 import PBKDF2
from pyaes import AESModeOfOperationCBC, Decrypter, Encrypter
logger = logging.getLogger("kindle.aescipher")
BLOCK_SIZE: int = 16 # the AES block size
def aes_cbc_encrypt(
    key: bytes, iv: bytes, data: str, padding: str = "default"
) -> bytes:
    """Encrypts data in cipher block chaining mode of operation.

    Args:
        key: The AES key.
        iv: The initialization vector.
        data: The data to encrypt.
        padding: Can be ``default`` or ``none`` (Default: default)

    Returns:
        The encrypted data.
    """
    cipher = Encrypter(AESModeOfOperationCBC(key, iv), padding=padding)
    # the final feed() call flushes the padding block
    return cipher.feed(data) + cipher.feed()
def aes_cbc_decrypt(
    key: bytes, iv: bytes, encrypted_data: bytes, padding: str = "default"
) -> bytes:
    """Decrypts data encrypted in cipher block chaining mode of operation.

    Args:
        key: The AES key used at encryption.
        iv: The initialization vector used at encryption.
        encrypted_data: The encrypted data to decrypt.
        padding: Can be ``default`` or ``none`` (Default: default)

    Returns:
        The decrypted data.
    """
    cipher = Decrypter(AESModeOfOperationCBC(key, iv), padding=padding)
    # the final feed() call strips the padding block
    return cipher.feed(encrypted_data) + cipher.feed()
def create_salt(salt_marker: bytes, kdf_iterations: int) -> Tuple[bytes, bytes]:
    """Creates the header and salt for the :func:`derive_from_pbkdf2` function.

    The header is ``salt_marker`` wrapping the KDF iteration count encoded
    as a big-endian unsigned short; the random salt fills the remainder of
    one AES block (16 bytes minus the header length).
    """
    packed_iterations = struct.pack(">H", kdf_iterations)
    header = b"".join((salt_marker, packed_iterations, salt_marker))
    return header, os.urandom(BLOCK_SIZE - len(header))
def pack_salt(header: bytes, salt: bytes) -> bytes:
    """Combines the header and salt created by :func:`create_salt` function."""
    return b"".join((header, salt))
def unpack_salt(packed_salt: bytes, salt_marker: bytes) -> Tuple[bytes, int]:
    """Unpack salt and kdf_iterations from previous created and packed salt.

    Raises:
        ValueError: If ``salt_marker`` does not wrap the iteration count.
    """
    mlen = len(salt_marker)
    hlen = 2 * mlen + 2
    leading = packed_salt[:mlen]
    trailing = packed_salt[mlen + 2 : hlen]
    if leading != salt_marker or trailing != salt_marker:
        raise ValueError("Check salt_marker.")
    (kdf_iterations,) = struct.unpack(">H", packed_salt[mlen : mlen + 2])
    return packed_salt[hlen:], kdf_iterations
def derive_from_pbkdf2(
    password: str, *, key_size: int, salt: bytes, kdf_iterations: int, hashmod, mac
) -> bytes:
    """Creates an AES key with the :class:`PBKDF2` key derivation class."""
    # NOTE(review): iterations are silently clamped to 65535 here, while
    # AESCipher.__init__ raises for >= 65536 — confirm the clamp is
    # intentional for callers that bypass AESCipher.
    kdf = PBKDF2(password, salt, min(kdf_iterations, 65535), hashmod, mac)
    return kdf.read(key_size)
class AESCipher:
    """Encrypt/Decrypt data using password to generate key.

    The encryption algorithm used is symmetric AES in cipher-block chaining
    (CBC) mode.

    The key is derived via the PBKDF2 key derivation function (KDF) from the
    password and a random salt of 16 bytes (the AES block size) minus the
    length of the salt header (see below).

    The hash function used by PBKDF2 is SHA256 per default. You can pass a
    different hash function module via the ``hashmod`` argument. The module
    must adhere to the Python API for Cryptographic Hash Functions (PEP 247).

    PBKDF2 uses a number of iterations of the hash function to derive the key,
    which can be set via the ``kdf_iterations`` keyword argument. The default
    number is 1000 and the maximum 65535.

    The header and the salt are written to the first block of the encrypted
    output (bytes mode) or written as key/value pairs (dict mode). The header
    consist of the number of KDF iterations encoded as a big-endian word bytes
    wrapped by ``salt_marker`` on both sides. With the default value of
    ``salt_marker = b'$'``, the header size is thus 4 and the salt 12 bytes.
    The salt marker must be a byte string of 1-6 bytes length.
    The last block of the encrypted output is padded with up to 16 bytes, all
    having the value of the length of the padding.
    All values in dict mode are written as base64 encoded string.

    Attributes:
        password: The password for encryption/decryption.
        key_size: The size of the key. Can be ``16``, ``24`` or ``32``
            (Default: 32).
        salt_marker: The salt marker with max. length of 6 bytes (Default: $).
        kdf_iterations: The number of iterations of the hash function to
            derive the key (Default: 1000).
        hashmod: The hash method to use (Default: sha256).
        mac: The mac module to use (Default: hmac).

    Args:
        password: The password for encryption/decryption.
        key_size: The size of the key. Can be ``16``, ``24`` or ``32``
            (Default: 32).
        salt_marker: The salt marker with max. length of 6 bytes (Default: $).
        kdf_iterations: The number of iterations of the hash function to
            derive the key (Default: 1000).
        hashmod: The hash method to use (Default: sha256).
        mac: The mac module to use (Default: hmac).

    Raises:
        ValueError: If `salt_marker` is not one to six bytes long.
        ValueError: If `kdf_iterations` is greater than 65535.
        TypeError: If type of `salt_marker` is not bytes.
    """

    def __init__(
        self,
        password: str,
        *,
        key_size: int = 32,
        salt_marker: bytes = b"$",
        kdf_iterations: int = 1000,
        hashmod=sha256,
        mac=hmac
    ) -> None:
        # Type check BEFORE len(): previously len() ran first, so unsized
        # salt_marker values raised a confusing TypeError from len() instead
        # of the documented "must be a bytes instance" message.
        if not isinstance(salt_marker, bytes):
            raise TypeError("salt_marker must be a bytes instance.")

        if not 1 <= len(salt_marker) <= 6:
            raise ValueError("The salt_marker must be one to six bytes long.")

        if kdf_iterations >= 65536:
            raise ValueError("kdf_iterations must be <= 65535.")

        self.password = password
        self.key_size = key_size
        self.hashmod = hashmod
        self.mac = mac
        self.salt_marker = salt_marker
        self.kdf_iterations = kdf_iterations

    def _encrypt(self, data: str) -> Tuple[bytes, bytes, bytes]:
        # Fresh salt and IV per call; returns (packed_salt, iv, ciphertext).
        header, salt = create_salt(self.salt_marker, self.kdf_iterations)
        key = derive_from_pbkdf2(
            password=self.password,
            key_size=self.key_size,
            salt=salt,
            kdf_iterations=self.kdf_iterations,
            hashmod=self.hashmod,
            mac=self.mac,
        )
        iv = os.urandom(BLOCK_SIZE)
        encrypted_data = aes_cbc_encrypt(key, iv, data)
        return pack_salt(header, salt), iv, encrypted_data

    def _decrypt(self, salt: bytes, iv: bytes, encrypted_data: bytes) -> str:
        try:
            salt, kdf_iterations = unpack_salt(salt, self.salt_marker)
        except ValueError:
            # Salt was not packed with our marker: use it as-is with the
            # configured iteration count.
            kdf_iterations = self.kdf_iterations
        key = derive_from_pbkdf2(
            password=self.password,
            key_size=self.key_size,
            salt=salt,
            kdf_iterations=kdf_iterations,
            hashmod=self.hashmod,
            mac=self.mac,
        )
        return aes_cbc_decrypt(key, iv, encrypted_data).decode("utf-8")

    def to_dict(self, data: str) -> Dict[str, str]:
        """Encrypts data in dict style.

        The output dict contains the base64 encoded (packed) salt, iv and
        ciphertext key/value pairs and an info key/value pair with additional
        encryption information.

        Args:
            data: The data to encrypt.

        Returns:
            The encrypted data in dict style.
        """
        salt, iv, encrypted_data = self._encrypt(data)
        return {
            "salt": base64.b64encode(salt).decode("utf-8"),
            "iv": base64.b64encode(iv).decode("utf-8"),
            "ciphertext": base64.b64encode(encrypted_data).decode("utf-8"),
            "info": "base64-encoded AES-CBC-256 of JSON object",
        }

    def from_dict(self, data: dict) -> str:
        """Decrypts data previously encrypted with :meth:`AESCipher.to_dict`.

        Args:
            data: The encrypted data in json style.

        Returns:
            The decrypted data.
        """
        salt = base64.b64decode(data["salt"])
        iv = base64.b64decode(data["iv"])
        encrypted_data = base64.b64decode(data["ciphertext"])
        return self._decrypt(salt, iv, encrypted_data)

    def to_bytes(self, data: str) -> bytes:
        """Encrypts data in bytes style.

        The output bytes contains the (packed) salt, iv and ciphertext.

        Args:
            data: The data to encrypt.

        Returns:
            The encrypted data in dict style.
        """
        salt, iv, encrypted_data = self._encrypt(data)
        return salt + iv + encrypted_data

    def from_bytes(self, data: bytes) -> str:
        """Decrypts data previously encrypted with :meth:`AESCipher.to_bytes`.

        Args:
            data: The encrypted data in bytes style.

        Returns:
            The decrypted data.
        """
        bs = BLOCK_SIZE
        # layout: [packed salt | iv | ciphertext], first two both one block
        salt = data[:bs]
        iv = data[bs : 2 * bs]
        encrypted_data = data[2 * bs :]
        return self._decrypt(salt, iv, encrypted_data)

    def to_file(
        self,
        data: str,
        filename: pathlib.Path,
        encryption: str = "json",
        indent: int = 4,
    ) -> None:
        """Encrypts and saves data to given file.

        Args:
            data: The data to encrypt.
            filename: The name of the file to save the data to.
            encryption: The encryption style to use. Can be ``json`` or
                ``bytes`` (Default: json).
            indent: The indention level when saving in json style
                (Default: 4).

        Raises:
            ValueError: If `encryption` is not ``json`` or ``bytes``.
        """
        if encryption == "json":
            encrypted_dict = self.to_dict(data)
            data_json = json.dumps(encrypted_dict, indent=indent)
            filename.write_text(data_json)
        elif encryption == "bytes":
            encrypted_data = self.to_bytes(data)
            filename.write_bytes(encrypted_data)
        else:
            raise ValueError('encryption must be "json" or "bytes"..')

    def from_file(self, filename: pathlib.Path, encryption: str = "json") -> str:
        """Loads and decrypts data from given file.

        Args:
            filename: The name of the file to load the data from.
            encryption: The encryption style which where used. Can be ``json``
                or ``bytes`` (Default: json).

        Returns:
            The decrypted data.

        Raises:
            ValueError: If `encryption` is not ``json`` or ``bytes``.
        """
        if encryption == "json":
            encrypted_json = filename.read_text()
            encrypted_dict = json.loads(encrypted_json)
            return self.from_dict(encrypted_dict)
        elif encryption == "bytes":
            encrypted_data = filename.read_bytes()
            return self.from_bytes(encrypted_data)
        else:
            raise ValueError('encryption must be "json" or "bytes".')
def detect_file_encryption(filename: pathlib.Path) -> Optional[str]:
    """Detect the encryption format from an authentication file.

    Args:
        filename: The name for the authentication file.

    Returns:
        ``False`` if file is not encrypted otherwise the encryption format.
    """
    # NOTE(review): despite the annotation this returns False, None, "json"
    # or "bytes"; JSON with neither marker key yields None, and UTF-8 text
    # that is invalid JSON raises JSONDecodeError — confirm callers only
    # pass auth files produced by this module.
    file = filename.read_bytes()
    encryption = None
    try:
        file = json.loads(file)
        if "adp_token" in file:
            # plaintext auth data
            encryption = False
        elif "ciphertext" in file:
            encryption = "json"
    except UnicodeDecodeError:
        # not UTF-8 text: assume raw salt+iv+ciphertext bytes
        encryption = "bytes"
    return encryption
def remove_file_encryption(
    source: Union[str, pathlib.Path],
    target: Union[str, pathlib.Path],
    password: str,
    **kwargs
) -> None:
    """Removes the encryption from an authentication file.

    Please try to load the authentication file with
    :meth:`audible.Authenticator.from_file` and save the authentication data
    as a unencrypted file first. Use this function as fallback if you ran into
    any error.

    Args:
        source: The encrypted authentication file.
        target: The filename for the decrypted file.
        password: The password for the encrypted authentication file.

    Raises:
        ValueError: If ``source`` is not encrypted.
    """
    src_path = pathlib.Path(source)
    fmt = detect_file_encryption(src_path)
    if not fmt:
        raise ValueError("file is not encrypted")
    plaintext = AESCipher(password, **kwargs).from_file(src_path, encryption=fmt)
    pathlib.Path(target).write_text(plaintext)

View File

@@ -8,16 +8,16 @@ __license__ = "GPL v3"
__version__ = "6.0"
import sys, os, re
import getopt
import re
import traceback
import time
import html.entities
import os
import re
import sys
import time
import traceback
import mobidedrm
import kgenpids
import mobidedrm
class DrmException(Exception):
@@ -57,7 +57,7 @@ def unicode_argv():
# Windows, with the underlying Windows API instead replacing multi-byte
# characters with '?'.
from ctypes import POINTER, byref, cdll, c_int, windll
from ctypes import POINTER, byref, c_int, cdll, windll
from ctypes.wintypes import LPCWSTR, LPWSTR
GetCommandLineW = cdll.kernel32.GetCommandLineW

View File

@@ -0,0 +1,444 @@
"""
This code is copied from https://github.com/apprenticeharper/DeDRM_tools and
recode to use amazon.ion instead of the DeDRM BinaryIonParser class. Added
support for converting a metadata file from DRMION format.
"""
import hashlib
import hmac
import os
import shutil
import zipfile
from io import BytesIO
from amazon.ion import simpleion
from amazon.ion.core import IonType
from amazon.ion.symbols import SymbolTableCatalog, shared_symbol_table
from .aescipher import aes_cbc_decrypt
pythonista_lzma = False
import lzma
SYM_NAMES = [
"com.amazon.drm.Envelope@1.0",
"com.amazon.drm.EnvelopeMetadata@1.0",
"size",
"page_size",
"encryption_key",
"encryption_transformation",
"encryption_voucher",
"signing_key",
"signing_algorithm",
"signing_voucher",
"com.amazon.drm.EncryptedPage@1.0",
"cipher_text",
"cipher_iv",
"com.amazon.drm.Signature@1.0",
"data",
"com.amazon.drm.EnvelopeIndexTable@1.0",
"length",
"offset",
"algorithm",
"encoded",
"encryption_algorithm",
"hashing_algorithm",
"expires",
"format",
"id",
"lock_parameters",
"strategy",
"com.amazon.drm.Key@1.0",
"com.amazon.drm.KeySet@1.0",
"com.amazon.drm.PIDv3@1.0",
"com.amazon.drm.PlainTextPage@1.0",
"com.amazon.drm.PlainText@1.0",
"com.amazon.drm.PrivateKey@1.0",
"com.amazon.drm.PublicKey@1.0",
"com.amazon.drm.SecretKey@1.0",
"com.amazon.drm.Voucher@1.0",
"public_key",
"private_key",
"com.amazon.drm.KeyPair@1.0",
"com.amazon.drm.ProtectedData@1.0",
"doctype",
"com.amazon.drm.EnvelopeIndexTableOffset@1.0",
"enddoc",
"license_type",
"license",
"watermark",
"key",
"value",
"com.amazon.drm.License@1.0",
"category",
"metadata",
"categorized_metadata",
"com.amazon.drm.CategorizedMetadata@1.0",
"com.amazon.drm.VoucherEnvelope@1.0",
"mac",
"voucher",
"com.amazon.drm.ProtectedData@2.0",
"com.amazon.drm.Envelope@2.0",
"com.amazon.drm.EnvelopeMetadata@2.0",
"com.amazon.drm.EncryptedPage@2.0",
"com.amazon.drm.PlainText@2.0",
"compression_algorithm",
"com.amazon.drm.Compressed@1.0",
"page_index_table",
"com.amazon.drm.VoucherEnvelope@2.0",
"com.amazon.drm.VoucherEnvelope@3.0",
]
# asserts must always raise exceptions for proper functioning
def _assert(test, msg="Exception"):
if not test:
raise Exception(msg)
def get_ion_parser(ion: bytes, single_value: bool = True, addprottable: bool = False):
    """Parse Amazon Ion data with simpleion.

    Args:
        ion: Raw Ion data.
        single_value: Passed through to ``simpleion.loads``.
        addprottable: When True, register the shared "ProtectedData" symbol
            table (SYM_NAMES) so DRM envelope symbols resolve.
    """
    catalog = SymbolTableCatalog()
    if addprottable:
        table = shared_symbol_table("ProtectedData", 1, SYM_NAMES)
        catalog.register(table)
    return simpleion.loads(ion, catalog=catalog, single_value=single_value)
class DrmIonVoucher:
    """Parses and decrypts a com.amazon.drm.VoucherEnvelope Ion blob.

    The envelope wraps an encrypted voucher whose AES key is derived from
    the device serial (dsn) and account secret via the PIDv3 scheme.
    Call :meth:`parse` first, then :meth:`decrypt_voucher`.
    """

    # class-level defaults; instances overwrite these in __init__/parse
    envelope = None
    version = None
    voucher = None
    drmkey = None
    license_type = "Unknown"
    encalgorithm = ""
    enctransformation = ""
    hashalgorithm = ""
    lockparams = None
    ciphertext = b""
    cipheriv = b""
    secretkey = b""

    def __init__(self, voucherenv, dsn, secret):
        # dsn: device serial; secret: account secret — both feed key derivation
        self.dsn, self.secret = dsn, secret
        self.lockparams = []
        self.envelope = get_ion_parser(voucherenv, addprottable=True)

    def decrypt_voucher(self):
        """Derive the PIDv3 HMAC key and decrypt the voucher's SecretKey."""
        shared = (
            "PIDv3" + self.encalgorithm + self.enctransformation + self.hashalgorithm
        )
        # lock parameters are folded into the shared secret in sorted order
        self.lockparams.sort()
        for param in self.lockparams:
            if param == "ACCOUNT_SECRET":
                shared += param + self.secret
            elif param == "CLIENT_ID":
                shared += param + self.dsn
            else:
                _assert(False, "Unknown lock parameter: %s" % param)
        sharedsecret = shared.encode("ASCII")
        key = hmac.new(sharedsecret, b"PIDv3", digestmod=hashlib.sha256).digest()
        b = aes_cbc_decrypt(key[:32], self.cipheriv[:16], self.ciphertext)
        self.drmkey = get_ion_parser(b, addprottable=True)
        _assert(
            len(self.drmkey) > 0
            and self.drmkey.ion_type == IonType.LIST
            and self.drmkey.ion_annotations[0].text == "com.amazon.drm.KeySet@1.0",
            "Expected KeySet, got %s" % self.drmkey.ion_annotations[0].text,
        )
        # pick the raw AES SecretKey out of the decrypted KeySet
        for item in self.drmkey:
            if item.ion_annotations[0].text != "com.amazon.drm.SecretKey@1.0":
                continue
            _assert(
                item["algorithm"] == "AES",
                "Unknown cipher algorithm: %s" % item["algorithm"],
            )
            _assert(item["format"] == "RAW", "Unknown key format: %s" % item["format"])
            self.secretkey = item["encoded"]

    def parse(self):
        """Validate the envelope, then extract strategy and lock parameters."""
        _assert(len(self.envelope) > 0, "Envelope is empty")
        _assert(
            self.envelope.ion_type == IonType.STRUCT
            and self.envelope.ion_annotations[0].text.startswith(
                "com.amazon.drm.VoucherEnvelope@"
            ),
            "Unknown type encountered in envelope, expected VoucherEnvelope",
        )
        # annotation like "...VoucherEnvelope@3.0" -> major version int 3
        self.version = int(self.envelope.ion_annotations[0].text.split("@")[1][:-2])
        self.voucher = get_ion_parser(self.envelope["voucher"], addprottable=True)
        strategy_annotation_name = self.envelope["strategy"].ion_annotations[0].text
        _assert(
            strategy_annotation_name == "com.amazon.drm.PIDv3@1.0",
            "Unknown strategy: %s" % strategy_annotation_name,
        )
        strategy = self.envelope["strategy"]
        self.encalgorithm = strategy["encryption_algorithm"]
        self.enctransformation = strategy["encryption_transformation"]
        self.hashalgorithm = strategy["hashing_algorithm"]
        lockparams = strategy["lock_parameters"]
        _assert(
            lockparams.ion_type == IonType.LIST,
            "Expected string list for lock_parameters",
        )
        self.lockparams.extend(lockparams)
        self.parse_voucher()

    def parse_voucher(self):
        """Extract cipher iv/text and license info from the inner voucher."""
        _assert(len(self.voucher) > 0, "Voucher is empty")
        _assert(
            self.voucher.ion_type == IonType.STRUCT
            and self.voucher.ion_annotations[0].text == "com.amazon.drm.Voucher@1.0",
            "Unknown type, expected Voucher",
        )
        self.cipheriv = self.voucher["cipher_iv"]
        self.ciphertext = self.voucher["cipher_text"]
        _assert(
            self.voucher["license"].ion_annotations[0].text
            == "com.amazon.drm.License@1.0",
            "Unknown license: %s" % self.voucher["license"].ion_annotations[0].text,
        )
        self.license_type = self.voucher["license"]["license_type"]

    def get_license_type(self):
        # set by parse_voucher(); remains "Unknown" before parsing
        return self.license_type
class DrmIon:
    """Decrypts a DRMION container: walks its Envelope lists, obtains the
    voucher via a callback, and writes the decrypted pages to a stream."""

    # Class-level defaults; instances overwrite them in __init__/parse().
    ion = None
    voucher = None
    vouchername = ""
    key = b""
    onvoucherrequired = None
    def __init__(self, ionstream, onvoucherrequired):
        """ionstream: DRMION payload (without the 8-byte magic framing);
        onvoucherrequired: callable(voucher_name) -> decrypted voucher."""
        self.ion = get_ion_parser(ionstream, addprottable=True, single_value=False)
        self.onvoucherrequired = onvoucherrequired
    def parse(self, outpages):
        """Decrypt every page in the container, writing plaintext to
        *outpages* (a writable binary stream)."""
        _assert(len(self.ion) > 0, "DRMION envelope is empty")
        _assert(
            self.ion[0].ion_type == IonType.SYMBOL
            and self.ion[0].ion_annotations[0].text == "doctype",
            "Expected doctype symbol",
        )
        _assert(
            self.ion[1].ion_type == IonType.LIST
            and self.ion[1].ion_annotations[0].text
            in ["com.amazon.drm.Envelope@1.0", "com.amazon.drm.Envelope@2.0"],
            "Unknown type encountered in DRMION envelope, expected Envelope, got %s"
            % self.ion[1].ion_annotations[0].text,
        )
        for ion_list in self.ion:
            # Only Envelope lists carry pages; skip everything else.
            if not ion_list.ion_annotations[0].text in [
                "com.amazon.drm.Envelope@1.0",
                "com.amazon.drm.Envelope@2.0",
            ]:
                continue
            for item in ion_list:
                if item.ion_annotations[0].text in [
                    "com.amazon.drm.EnvelopeMetadata@1.0",
                    "com.amazon.drm.EnvelopeMetadata@2.0",
                ]:
                    if item.get("encryption_voucher") is None:
                        continue
                    if self.vouchername == "":
                        # First metadata entry: fetch the voucher and its key.
                        self.vouchername = item["encryption_voucher"]
                        self.voucher = self.onvoucherrequired(self.vouchername)
                        self.key = self.voucher.secretkey
                        _assert(
                            self.key is not None,
                            "Unable to obtain secret key from voucher",
                        )
                    else:
                        # Subsequent entries must reference the same voucher.
                        _assert(
                            self.vouchername == item["encryption_voucher"],
                            "Unexpected: Different vouchers required for same file?",
                        )
                elif item.ion_annotations[0].text in [
                    "com.amazon.drm.EncryptedPage@1.0",
                    "com.amazon.drm.EncryptedPage@2.0",
                ]:
                    decompress = False
                    decrypt = True
                    if item["cipher_text"].ion_annotations:
                        if (
                            item["cipher_text"].ion_annotations[0].text
                            == "com.amazon.drm.Compressed@1.0"
                        ):
                            decompress = True
                    ct = item["cipher_text"]
                    civ = item["cipher_iv"]
                    if ct is not None and civ is not None:
                        self.processpage(ct, civ, outpages, decompress, decrypt)
                elif item.ion_annotations[0].text in [
                    "com.amazon.drm.PlainText@1.0",
                    "com.amazon.drm.PlainText@2.0",
                ]:
                    # Unencrypted page; may still be LZMA-compressed.
                    decompress = False
                    decrypt = False
                    if (
                        item["data"].ion_annotations[0].text
                        == "com.amazon.drm.Compressed@1.0"
                    ):
                        decompress = True
                    self.processpage(item["data"], None, outpages, decompress, decrypt)
    def processpage(self, ct, civ, outpages, decompress, decrypt):
        """AES-CBC-decrypt (optional) and LZMA-decompress (optional) one page,
        then write the result to *outpages*."""
        if decrypt:
            msg = aes_cbc_decrypt(self.key[:16], civ[:16], ct)
        else:
            msg = ct
        if not decompress:
            outpages.write(msg)
            return
        # First byte is the LZMA UseFilter flag; only 0 is supported.
        _assert(msg[0] == 0, "LZMA UseFilter not supported")
        if pythonista_lzma:
            # NOTE(review): assumes Pythonista's lzma.decompress returns a
            # buffer object with .getvalue() -- the stdlib returns bytes,
            # which would fail here; confirm on that platform.
            segment = lzma.decompress(msg[1:])
            msg = b""
            outpages.write(segment.getvalue())
            return 0
        decomp = lzma.LZMADecompressor(format=lzma.FORMAT_ALONE)
        while not decomp.eof:
            segment = decomp.decompress(msg[1:])
            msg = b""  # Contents were internally buffered after the first call
            outpages.write(segment)
class KFXZipBook:
    """DeDRM adapter for a .kfx-zip archive: finds the DRM voucher inside the
    zip, decrypts it with the device DSN, then decrypts each DRMION member."""

    def __init__(self, infile, dsn):
        # infile: path of the .kfx-zip archive; dsn: device serial number
        # (optionally concatenated with the account secret).
        self.infile = infile
        self.dsn = dsn
        self.voucher = None
        self.decrypted = {}
    def getPIDMetaInfo(self):
        """DeDRM interface hook; KFX books carry no PID meta info."""
        return (None, None)
    def processBook(self):
        """Decrypt every DRMION member of the archive into self.decrypted."""
        with zipfile.ZipFile(self.infile, "r") as zf:
            for filename in zf.namelist():
                with zf.open(filename) as fh:
                    data = fh.read(8)
                    # DRMION payloads start with this 8-byte magic.
                    if data != b"\xeaDRMION\xee":
                        continue
                    data += fh.read()
                if self.voucher is None:
                    self.decrypt_voucher()
                print("Decrypting KFX DRMION: {0}".format(filename))
                outfile = BytesIO()
                # Strip the 8-byte magic at both ends before parsing.
                DrmIon(data[8:-8], lambda name: self.voucher).parse(outfile)
                outfile = outfile.getvalue()
                if len(outfile) > 0:
                    self.decrypted[filename] = outfile
                else:
                    print(
                        "Decrypting KFX DRMION {0} results in a length of Zero. Skip file.".format(
                            filename
                        )
                    )
        if not self.decrypted:
            print("The .kfx-zip archive does not contain an encrypted DRMION file")
    def decrypt_voucher(self):
        """Locate the DRM voucher member and decrypt it, trying every
        plausible split of self.dsn into DSN / account-secret lengths."""
        with zipfile.ZipFile(self.infile, "r") as zf:
            for info in zf.infolist():
                with zf.open(info.filename) as fh:
                    data = fh.read(4)
                    # Ion binary version marker identifies candidate members.
                    if data != b"\xe0\x01\x00\xea":
                        continue
                    data += fh.read()
                if b"ProtectedData" in data:
                    break  # found DRM voucher
            else:
                raise Exception(
                    "The .kfx-zip archive contains an encrypted DRMION file without a DRM voucher"
                )
        print("Decrypting KFX DRM voucher: {0}".format(info.filename))
        for pid in [""] + [self.dsn]:
            # Candidate (DSN length, secret length) pairs; the pid string
            # must match one exactly to be split and tried.
            for dsn_len, secret_len in [
                (0, 0),
                (16, 0),
                (16, 40),
                (32, 40),
                (40, 0),
                (40, 40),
            ]:
                if len(pid) == dsn_len + secret_len:
                    break  # split pid into DSN and account secret
            else:
                continue
            try:
                voucher = DrmIonVoucher(data, pid[:dsn_len], pid[dsn_len:])
                voucher.parse()
                voucher.decrypt_voucher()
                break
            except:
                # NOTE(review): bare except deliberately swallows failures so
                # the next pid candidate is tried; a final failure is
                # reported by the for-else below.
                pass
        else:
            raise Exception("Failed to decrypt KFX DRM voucher with any key")
        print("KFX DRM voucher successfully decrypted")
        license_type = voucher.get_license_type()
        if license_type != "Purchase":
            raise Exception(
                (
                    "This book is licensed as {0}. "
                    "These tools are intended for use on purchased books."
                ).format(license_type)
            )
        self.voucher = voucher
    def getBookTitle(self):
        """Book title = archive filename without its extension."""
        return os.path.splitext(os.path.split(self.infile)[1])[0]
    def getBookExtension(self):
        return ".kfx-zip"
    def getBookType(self):
        return "KFX-ZIP"
    def cleanup(self):
        # Nothing to clean up; required by the DeDRM interface.
        pass
    def getFile(self, outpath):
        """Write the (decrypted, if any) archive to *outpath*; members that
        were not DRMION payloads are copied through unchanged."""
        if not self.decrypted:
            shutil.copyfile(self.infile, outpath)
        else:
            with zipfile.ZipFile(self.infile, "r") as zif:
                with zipfile.ZipFile(outpath, "w") as zof:
                    for info in zif.infolist():
                        zof.writestr(
                            info,
                            self.decrypted.get(info.filename, zif.read(info.filename)),
                        )

View File

@@ -9,9 +9,8 @@ __version__ = "3.0"
import binascii
import hashlib
from struct import pack
import traceback
from struct import pack
global charMap1
global charMap3

View File

@@ -23,8 +23,8 @@ __version__ = "1.0"
# For example, ActiveState Python, which exists for windows.
import struct
import binascii
import struct
class DrmException(Exception):

View File

@@ -0,0 +1,400 @@
import base64
import json
import os
import pathlib
import re
import shutil
import time
from collections import namedtuple
from datetime import datetime
from enum import Enum
from io import BytesIO
from zipfile import ZipFile
import requests
import xmltodict
from amazon.ion import simpleion
from mobi import extract
from rich import print
from kindle_download_helper import amazon_api
from kindle_download_helper.config import (
API_MANIFEST_URL,
DEFAULT_OUT_DEDRM_DIR,
DEFAULT_OUT_DIR,
DEFAULT_OUT_EPUB_DIR,
)
from kindle_download_helper.dedrm import MobiBook, get_pid_list
from kindle_download_helper.dedrm.kfxdedrm import KFXZipBook
from kindle_download_helper.third_party.ion import DrmIon, DrmIonVoucher
from kindle_download_helper.third_party.kfxlib import YJ_Book
DEBUG = False  # module debug switch (not referenced in the visible code)
class Scope(Enum):
    """Download-requirement level of a manifest resource.

    Higher values are more permissive: REQUIRED fetches only mandatory
    parts, DEFERRED fetches everything.
    """

    REQUIRED = 1
    PREFERRED = 2
    DEFERRED = 3

    def should_download(self, s: str):
        """True when a resource whose requirement name is *s*
        (case-insensitive) falls within this scope."""
        wanted = Scope[s.upper()]
        return self.value >= wanted.value
# One downloadable book part: HTTP method, resource URL, target filename
# (None = derive from Content-Disposition later), and extra headers.
Request = namedtuple("Request", ["method", "url", "fn", "headers"])
def _build_correlation_id(device, serial, asin, timestamp):
if timestamp is None:
timestamp = datetime.utcnow().timestamp()
timestamp = str(int(timestamp) * 1000)
return f"Device:{device}:{serial};kindle.EBOK:{asin}:{timestamp}"
class NoKindle:
    """Download books through Amazon's Kindle device APIs without a physical
    Kindle, strip their DRM, and convert them to EPUB.

    The constructor performs a network login immediately; the resulting
    tokens sign every later request.
    """

    def __init__(
        self,
        email,
        password,
        domain,
        out_dir=DEFAULT_OUT_DIR,
        out_dedrm_dir=DEFAULT_OUT_DEDRM_DIR,
        out_epub_dir=DEFAULT_OUT_EPUB_DIR,
        cut_length=100,
    ):
        self.out_dir = out_dir
        self.out_dedrm_dir = out_dedrm_dir
        self.out_epub_dir = out_epub_dir
        # Bug fix: cut_length was accepted but never stored, so every later
        # `self.cut_length` reference raised AttributeError.
        self.cut_length = cut_length
        self.session = requests.Session()
        self.ebooks = []
        self.pdocs = []
        self.library_dict = {}
        print("Authenticating . . .")
        self.tokens = amazon_api.login(email, password, domain)

    def decrypt_voucher(self, voucher_data):
        """Decrypt a DRM voucher blob by trying every plausible split of the
        device id into (DSN length, account-secret length).

        Returns the decrypted voucher, or None when no candidate matches.
        """
        with BytesIO(voucher_data) as voucher_data_io:
            for pid in [""] + [self.tokens["device_id"]]:
                for dsn_len, secret_len in [
                    (0, 0),
                    (16, 0),
                    (16, 40),
                    (32, 40),
                    (40, 0),
                    (40, 40),
                ]:
                    if len(pid) == dsn_len + secret_len:
                        break  # split pid into DSN and account secret
                else:
                    continue
                voucher = DrmIonVoucher(voucher_data_io, pid[:dsn_len], pid[dsn_len:])
                voucher.parse()
                # NOTE(review): assumes third_party.ion.DrmIonVoucher exposes
                # decryptvoucher() (DeDRM naming); the bundled kfxdedrm
                # variant names it decrypt_voucher() -- confirm.
                voucher.decryptvoucher()
                return voucher

    def decrypt_kfx(self, kfx_data):
        """Return decrypted KFX container bytes; non-DRM data passes through."""
        if kfx_data[:8] != b"\xeaDRMION\xee":
            return kfx_data
        with BytesIO() as decrypted_data:
            DrmIon(BytesIO(kfx_data[8:-8]), lambda name: self.drm_voucher).parse(
                decrypted_data
            )
            return decrypted_data.getvalue()

    def get_resource(self, resource, asin):
        """Fetch one manifest resource.

        Returns (content bytes, suggested filename): vouchers get ".ast",
        everything else ".kfx".
        """
        resp = self.session.send(
            amazon_api.signed_request(
                "GET",
                resource["endpoint"]["url"],
                asin=asin,
                tokens=self.tokens,
                request_id=resource["id"],
                request_type=resource["type"],
            )
        )
        filename = resource["id"]
        filename += ".ast" if resource["type"] == "DRM_VOUCHER" else ".kfx"
        return (resp.content, filename)

    def make_library(self, last_sync=None):
        """Fetches the user library into self.ebooks / self.pdocs and fills
        self.library_dict (ASIN -> title)."""
        url = "https://todo-ta-g7g.amazon.com/FionaTodoListProxy/syncMetaData"
        # NOTE(review): params is built but never passed to the request --
        # confirm whether signed_request should carry it.
        params = {"item_count": 10000}
        if isinstance(last_sync, dict):
            try:
                last_sync = last_sync["sync_time"]
            except KeyError as exc:
                raise ValueError("`last_sync` doesn't contain `sync_time`.") from exc
        if last_sync is not None:
            params["last_sync_time"] = last_sync
        r = self.session.send(
            amazon_api.signed_request(
                "GET",
                url,
                tokens=self.tokens,
            )
        )
        # Round-trip through JSON to turn xmltodict's ordered structures
        # into plain dicts/lists.
        library = xmltodict.parse(r.text)
        library = json.loads(json.dumps(library))
        library = library["response"]["add_update_list"]
        ebooks = [i for i in library["meta_data"] if i["cde_contenttype"] == "EBOK"]
        pdocs = [i for i in library["meta_data"] if i["cde_contenttype"] == "PDOC"]
        # Only purchased ebooks can be decrypted by these tools.
        ebooks = [e for e in ebooks if e["origins"]["origin"]["type"] == "Purchase"]
        unknow_index = 1
        for i in pdocs + ebooks:
            if isinstance(i["title"], dict):
                # Title came back as an XML attribute dict; fall back to a
                # running index when even "#text" is missing.
                if i["ASIN"] in self.library_dict:
                    unknow_index += 1
                self.library_dict[i["ASIN"]] = i["title"].get(
                    "#text", str(unknow_index)
                )
            else:
                self.library_dict[i["ASIN"]] = i["title"]
        self.ebooks = ebooks
        self.pdocs = pdocs

    def sidecar_ebook(self, asin):
        """Fetch and print the sidecar (annotations) payload for an ebook."""
        url = f"https://sars.amazon.com/sidecar/sa/EBOK/{asin}"
        r = self.session.send(
            amazon_api.signed_request(
                "GET",
                url,
                tokens=self.tokens,
            )
        )
        print(r.json())

    @staticmethod
    def _b64ion_to_dict(b64ion: str):
        """Decode a base64-encoded Ion value into a plain dict."""
        return dict(simpleion.loads(base64.b64decode(b64ion)))

    def get_book(self, asin):
        """Fetch a book's delivery manifest.

        Returns (manifest, is_kfx, info); manifest is None on failure and
        is_kfx is True when the book carries a DRM voucher (KFX path).
        """
        manifest_resp = self.session.send(
            amazon_api.signed_request(
                "GET",
                API_MANIFEST_URL + asin.upper(),
                asin=asin,
                tokens=self.tokens,
                request_type="manifest",
            )
        )
        try:
            resources = manifest_resp.json()["resources"]
        except Exception as e:
            print(manifest_resp.json(), str(e))
            return None, False, str(e)
        manifest = manifest_resp.json()
        # azw3 is not so hard: no voucher means the plain AZW path.
        drm_voucher_list = [
            resource for resource in resources if resource["type"] == "DRM_VOUCHER"
        ]
        if not drm_voucher_list:
            return manifest, False, "Succeed"
        drm_voucher = drm_voucher_list[0]
        try:
            self.drm_voucher = self.decrypt_voucher(
                self.get_resource(drm_voucher, asin)[0]
            )
        except Exception:  # was a bare except; keep the best-effort behaviour
            print("Could not decrypt the drm voucher!")
        # Response contexts arrive base64-Ion encoded; decode them in place.
        manifest["responseContext"] = self._b64ion_to_dict(manifest["responseContext"])
        for resource in manifest["resources"]:
            if "responseContext" in resource:
                resource["responseContext"] = self._b64ion_to_dict(
                    resource["responseContext"]
                )
        return manifest, True, "Succeed"

    def download_book(self, asin, error=None):
        """Download one book by ASIN, choosing the KFX or AZW pipeline."""
        manifest, is_kfx, info = self.get_book(asin)
        if not manifest:
            print(f"Error to download ASIN: {asin}, error: {str(info)}")
            return
        if is_kfx:
            self._download_kfx(manifest, asin)
        else:
            self._download_azw(manifest, asin)

    def _download_kfx(self, manifest, asin):
        """Download all KFX parts, zip them, de-DRM, and convert to EPUB."""
        resources = manifest["resources"]
        parts = []
        scope = Scope.DEFERRED
        if isinstance(scope, str):
            try:
                scope = Scope[scope.upper()]
            except KeyError:
                allowed_scopes = [s.name.lower() for s in Scope]
                raise ValueError(
                    "Scope must be in %s, got %s" % (", ".join(allowed_scopes), scope)
                )
        for resource in resources:
            if not scope.should_download(resource["requirement"]):
                continue
            try:
                url = (
                    resource.get("optimalEndpoint", {}).get("directUrl")
                    or resource.get("endpoint")["url"]
                )
            except KeyError:
                raise RuntimeError("No url found for item with id %s." % resource["id"])
            headers = {}
            fn = None
            if resource["type"] == "DRM_VOUCHER":
                fn = resource["id"] + ".voucher"
                correlation_id = _build_correlation_id(
                    "A2A33MVZVPQKHY",
                    self.tokens["device_id"],
                    asin=manifest["content"]["id"],
                    timestamp=manifest["responseContext"]["manifestTime"],
                )
                # Voucher downloads require the full ADP header set.
                headers = {
                    "User-Agent": "Kindle/1.0.235280.0.10 CFNetwork/1220.1 Darwin/20.3.0",
                    "X-ADP-AttemptCount": "1",
                    "X-ADP-CorrelationId": correlation_id,
                    "X-ADP-Transport": str(manifest["responseContext"]["transport"]),
                    "X-ADP-Reason": str(manifest["responseContext"]["reason"]),
                    "x-amzn-accept-type": "application/x.amzn.digital.deliverymanifest@1.0",
                    "X-ADP-SW": str(manifest["responseContext"]["swVersion"]),
                    "X-ADP-LTO": "60",
                    "Accept": "application/x-com.amazon.drm.Voucher@1.0",
                }
                if "country" in manifest["responseContext"]:
                    headers["X-ADP-Country"] = str(
                        manifest["responseContext"]["country"]
                    )
                url += "&supportedVoucherVersions=V1"
            elif resource["type"] == "KINDLE_MAIN_BASE":
                fn = manifest["content"]["id"] + "_EBOK.azw"
            elif resource["type"] == "KINDLE_MAIN_METADATA":
                fn = resource["id"] + ".azw.md"
            elif resource["type"] == "KINDLE_MAIN_ATTACHABLE":
                fn = resource["id"] + ".azw.res"
            elif resource["type"] == "KINDLE_USER_ANOT":
                fn = manifest["content"]["id"] + "_EBOK.mbpV2"
            parts.append(Request(method="GET", url=url, fn=fn, headers=headers))
        files = []
        for part in parts:
            r = self.session.send(
                amazon_api.signed_request(
                    part.method,
                    part.url,
                    asin=asin,
                    tokens=self.tokens,
                    headers=part.headers,
                )
            )
            fn = part.fn
            if fn is None:
                # Derive the filename from the Content-Disposition header.
                cd = r.headers.get("content-disposition")
                fn = re.findall('filename="(.+)"', cd)[0]
            # Bug fix: the original built a plain str and then called
            # str.write_bytes() / str.unlink(); use a pathlib.Path.
            fn = pathlib.Path(self.out_dir) / fn
            files.append(fn)
            fn.write_bytes(r.content)
            print(f"Book part successfully saved to {fn}")
        asin = manifest["content"]["id"].upper()
        manifest_file = pathlib.Path(f"{asin}.manifest")
        manifest_file.write_text(json.dumps(manifest))
        files.append(manifest_file)
        # Fall back to the ASIN when the library holds no title for the book.
        name = self.library_dict.get(asin, asin)
        if len(name) > self.cut_length:
            name = name[: self.cut_length - 10]
        fn = os.path.join(self.out_dir, name + "_" + asin + "_EBOK.kfx-zip")
        out_epub = os.path.join(self.out_epub_dir, name.split(".")[0] + ".epub")
        with ZipFile(fn, "w") as myzip:
            for file in files:
                myzip.write(file)
                file.unlink()
        # De-DRM the zipped parts into a temp file, then swap it in place.
        fn_dec = name + "_" + asin + "_EBOK.kfx-zip.tmp"
        kfx_book = KFXZipBook(fn, self.tokens["device_id"])
        kfx_book.voucher = self.drm_voucher
        kfx_book.processBook()
        kfx_book.getFile(fn_dec)
        pathlib.Path(fn).unlink()
        pathlib.Path(fn_dec).rename(fn)
        epub_data = YJ_Book(fn).convert_to_epub()
        with open(out_epub, "wb") as f:
            f.write(epub_data)

    def _download_azw(self, manifest, asin):
        """Download a plain AZW3 book, strip its DRM and convert to EPUB."""
        resources = manifest["resources"]
        url = resources[0]["endpoint"]["url"]
        r = self.session.send(
            amazon_api.signed_request(
                "GET",
                url,
                asin=asin,
                tokens=self.tokens,
            )
        )
        name = self.library_dict.get(asin, asin)
        if len(name) > self.cut_length:
            name = name[: self.cut_length - 10]
        out = os.path.join(self.out_dir, name + ".azw3")
        out_epub = os.path.join(self.out_epub_dir, name + ".epub")
        with open(out, "wb") as f:
            for chunk in r.iter_content(chunk_size=512):
                f.write(chunk)
        out_dedrm = os.path.join(self.out_dedrm_dir, name)
        time.sleep(1)
        mb = MobiBook(out)
        md1, md2 = mb.get_pid_meta_info()
        # Bug fix: self.device_serial_number was never assigned anywhere in
        # this class (AttributeError); use the authenticated device id as
        # the DSN, consistent with the KFX path above.
        totalpids = get_pid_list(md1, md2, [self.tokens["device_id"]], [])
        totalpids = list(set(totalpids))
        mb.make_drm_file(totalpids, out_dedrm)
        time.sleep(1)
        # save to EPUB, then remove the extraction temp directory
        epub_dir, epub_file = extract(out_dedrm)
        print(epub_file)
        shutil.copy2(epub_file, out_epub)
        shutil.rmtree(epub_dir)
if __name__ == "__main__":
    # Bug fix: NoKindle() was called with no arguments although email,
    # password and domain are required, so the script always crashed with
    # TypeError.  Read credentials from the environment instead.
    kindle = NoKindle(
        os.environ["KINDLE_EMAIL"],
        os.environ["KINDLE_PASSWORD"],
        os.environ.get("KINDLE_DOMAIN", "com"),
    )
    kindle.make_library()
    for e in kindle.ebooks:
        try:
            kindle.download_book(e["ASIN"])
        except Exception as err:  # was `as e`, which shadowed the loop variable
            import traceback

            traceback.print_exc()
            print(err)
        # spider rule: throttle requests so we do not hammer the API
        time.sleep(1)

1259
kindle_download_helper/third_party/ion.py vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,36 @@
#! /usr/bin/python3
from __future__ import absolute_import, division, print_function, unicode_literals
from . import message_logging, utilities, yj_book, yj_metadata
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
# Public API of the kfxlib package: re-export the commonly used classes and
# helpers from the submodules under short, stable names.
set_logger = message_logging.set_logger
YJ_Book = yj_book.YJ_Book
YJ_Metadata = yj_metadata.YJ_Metadata
KFXDRMError = utilities.KFXDRMError
clean_message = utilities.clean_message
file_read_binary = utilities.file_read_binary
file_write_binary = utilities.file_write_binary
file_read_utf8 = utilities.file_read_utf8
file_write_utf8 = utilities.file_write_utf8
json_deserialize = utilities.json_deserialize
json_serialize = utilities.json_serialize
unicode_argv = utilities.unicode_argv
windows_long_path_fix = utilities.windows_long_path_fix
# Platform detection flags and platform-specific path helpers.
IS_LINUX = utilities.IS_LINUX
IS_MACOS = utilities.IS_MACOS
IS_WINDOWS = utilities.IS_WINDOWS
user_home_dir = utilities.user_home_dir
windows_user_dir = utilities.windows_user_dir
locale_encode = utilities.locale_encode
locale_decode = utilities.locale_decode
os_environ_get = utilities.os_environ_get

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,439 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import datetime
import decimal
import math
import re
from .message_logging import log
from .python_transition import IS_PYTHON2, bytes_to_hex
from .utilities import sha1, type_name
if IS_PYTHON2:
from .python_transition import repr, str
else:
long = int
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
# Blobs at or above this size (that are not ASCII text) count as "large".
LARGE_DATA_SIZE = 256
# Upper bound for attempting to render blob contents as ASCII text.
MAX_ASCII_DATA_SIZE = 10000
# Ion scalar types that map directly onto Python builtins.
IonBool = bool
IonDecimal = decimal.Decimal
IonFloat = float
IonInt = int
IonList = list
IonNull = type(None)
IonString = str
def ion_type(value):
    """Return the canonical Ion type of *value*; raise on non-Ion data.

    Exact type matches are checked first; subclass checks follow so that
    e.g. an IonSExp is not misreported as IonList.
    """
    value_type = type(value)
    if value_type in ION_TYPES:
        return value_type
    if isinstance(value, IonAnnotation):
        return IonAnnotation
    # A list subclass other than IonSExp still counts as an Ion list.
    if isinstance(value, IonList) and not isinstance(value, IonSExp):
        return IonList
    if isinstance(value, long):
        return IonInt
    raise Exception("Data has non-Ion type %s: %s" % (type_name(value), repr(value)))
def isstring(value):
    """True for plain text strings, excluding IonSymbol (an str subclass)."""
    if not isinstance(value, str):
        return False
    return not isinstance(value, IonSymbol)
class IonAnnotation(object):
    """An Ion value wrapped with one or more annotation symbols.

    Annotations are normalized to an IonAnnots tuple; nesting annotations
    (annotating an already annotated value) is rejected.
    """

    def __init__(self, annotations, value):
        self.annotations = (
            annotations
            if isinstance(annotations, IonAnnots)
            else IonAnnots(annotations)
        )
        if isinstance(value, IonAnnotation):
            raise Exception("IonAnnotation cannot be annotated")
        self.value = value
    def __repr__(self):
        return "%s %s" % (repr(self.annotations), repr(self.value))
    def __str__(self):
        return repr(self.annotations)
    def is_single(self):
        """True when exactly one annotation is attached."""
        return len(self.annotations) == 1
    def has_annotation(self, annotation):
        """True when *annotation* appears anywhere in the annotation list."""
        return annotation in self.annotations
    def is_annotation(self, annotation):
        """True when *annotation* is the one and only annotation."""
        return self.is_single() and self.annotations[0] == annotation
    def get_annotation(self):
        """Return the single annotation; raise if there is not exactly one."""
        if not self.is_single():
            raise Exception(
                "get_annotation expected single annotation, found %s"
                % repr(self.annotations)
            )
        return self.annotations[0]
    def verify_annotation(self, annotation):
        """Raise unless *annotation* is the sole annotation; return self so
        calls can be chained."""
        if not self.is_annotation(annotation):
            raise Exception(
                "Expected annotation %s, found %s"
                % (repr(annotation), repr(self.annotations))
            )
        return self
class IonAnnots(tuple):
    """Immutable, non-empty tuple of IonSymbol annotations."""

    def __new__(cls, annotations):
        annots = tuple.__new__(cls, annotations)
        # Enforce the Ion invariants: at least one annotation, all symbols.
        if len(annots) == 0:
            raise Exception("IonAnnotation cannot be empty")
        for a in annots:
            if not isinstance(a, IonSymbol):
                raise Exception("IonAnnotation must be IonSymbol: %s" % repr(a))
        return annots
    def __repr__(self):
        # Ion text notation: each annotation is suffixed with "::".
        return " ".join(["%s::" % repr(a) for a in self])
class IonBLOB(bytes):
    """Ion BLOB value: binary large object with strict comparison rules.

    NOTE(review): defining __eq__ on a bytes subclass implicitly sets
    __hash__ to None, making IonBLOB unhashable -- confirm this is intended.
    """

    def __eq__(self, other):
        if other is None:
            return False
        # Refuse comparison with anything that is not byte-like.
        if not isinstance(other, (IonBLOB, bytes)):
            raise Exception("IonBLOB __eq__: comparing with %s" % type_name(other))
        return bytes(self) == bytes(other)
    def __ne__(self, other):
        return not self.__eq__(other)
    # Ordering comparisons are deliberately unsupported.
    def __lt__(self, other):
        raise Exception("IonBLOB __lt__ not implemented")
    def __le__(self, other):
        raise Exception("IonBLOB __le__ not implemented")
    def __gt__(self, other):
        raise Exception("IonBLOB __gt__ not implemented")
    def __ge__(self, other):
        raise Exception("IonBLOB __ge__ not implemented")
    def __repr__(self):
        # Summarize by length and SHA-1 rather than dumping the payload.
        return "*** %d byte BLOB %s ***" % (len(self), bytes_to_hex(sha1(self)))
    def ascii_data(self):
        """Return the payload as printable ASCII text, or None when it is
        too large or contains non-printable characters."""
        if len(self) > MAX_ASCII_DATA_SIZE:
            return None
        try:
            data = self.decode("ascii")
        except UnicodeDecodeError:
            return None
        for c in data:
            o = ord(c)
            # Allow tab/LF/CR; reject other control chars and DEL+.
            if (o < 32 and o not in [9, 10, 13]) or o >= 127:
                return None
        return data
    def is_large(self):
        """True for big non-text blobs (candidates for omission in dumps)."""
        return len(self) >= LARGE_DATA_SIZE and self.ascii_data() is None
    def tobytes(self):
        """Return the payload as a plain bytes object."""
        return bytes(self)
class IonCLOB(bytes):
    """Ion CLOB value: character large object carried as raw bytes."""

    def tobytes(self):
        """Return the payload as a plain bytes object."""
        return bytes(self)
class IonNop(object):
    # Marker for binary-Ion NOP padding; deserializers filter these out.
    pass
class IonSExp(list):
    """Ion S-expression: a list rendered with parentheses instead of brackets."""

    def __repr__(self):
        inner = ", ".join(repr(item) for item in self)
        return "(%s)" % inner

    def tolist(self):
        """Return a plain list copy of the elements."""
        return list(self)
class IonStruct(collections.OrderedDict):
    """Ordered Ion struct.

    Constructed either from a single mapping/iterable argument or from an
    even-length flat sequence of alternating key, value arguments.
    """

    def __init__(self, *args):
        if len(args) == 1:
            super().__init__(args[0])
            return
        super().__init__()
        if len(args) % 2:
            raise Exception("IonStruct created with %d arguments" % len(args))
        pairs = iter(args)
        for key in pairs:
            self[key] = next(pairs)

    def __repr__(self):
        rendered = ("%s: %s" % (repr(k), repr(v)) for k, v in self.items())
        return "{%s}" % ", ".join(rendered)

    def todict(self):
        """Return a plain OrderedDict copy."""
        return collections.OrderedDict(self)
class IonSymbol(str):
    """Ion symbol: an identifier-flavoured subclass of str."""

    def __repr__(self):
        # Bare form when every character is printable ASCII (and the symbol
        # is non-empty); otherwise wrap in backquotes.
        if re.match(r"^[\u0021-\u007e]+$", self):
            return str(self)
        return "`%s`" % self

    def tostring(self):
        """Return the symbol text as a plain str."""
        return str(self)


IS = IonSymbol
class IonTimestamp(datetime.datetime):
    """datetime subclass whose repr honours an attached IonTimestampTZ
    (precision-aware Ion text rendering); plain tzinfo falls back to
    isoformat()."""

    def __repr__(self):
        value = self
        if isinstance(value.tzinfo, IonTimestampTZ):
            format = value.tzinfo.format()
            # Render fractional seconds at the timestamp's own precision.
            format = format.replace(
                "%f", ("%06d" % value.microsecond)[: value.tzinfo.fraction_len()]
            )
            if value.year < 1900:
                # Substitute the year textually and format a 1900 stand-in
                # (strftime can reject years before 1900 on some platforms).
                format = format.replace("%Y", "%04d" % value.year)
                value = value.replace(year=1900)
            return value.strftime(format) + (
                value.tzname() if value.tzinfo.present() else ""
            )
        return value.isoformat()
# strftime-style formats, one per Ion timestamp precision
# (year, year-month, date, minutes, seconds, fractional seconds).
ION_TIMESTAMP_Y = "%YT"
ION_TIMESTAMP_YM = "%Y-%mT"
ION_TIMESTAMP_YMD = "%Y-%m-%d"
ION_TIMESTAMP_YMDHM = "%Y-%m-%dT%H:%M"
ION_TIMESTAMP_YMDHMS = "%Y-%m-%dT%H:%M:%S"
ION_TIMESTAMP_YMDHMSF = "%Y-%m-%dT%H:%M:%S.%f"
class IonTimestampTZ(datetime.tzinfo):
    """tzinfo that also records an Ion timestamp's precision.

    offset: minutes east of UTC, or None for unknown local offset ("-00:00");
    format: one of the ION_TIMESTAMP_* formats (determines precision);
    fraction_len: number of fractional-second digits (0-6, only valid with
    the fractional format).
    """

    def __init__(self, offset, format, fraction_len):
        datetime.tzinfo.__init__(self)
        self.__offset = offset
        self.__format = format
        self.__fraction_len = fraction_len
        # A time-of-day (and thus an offset) is only "present" for formats
        # that include hours and minutes.
        self.__present = format in {
            ION_TIMESTAMP_YMDHM,
            ION_TIMESTAMP_YMDHMS,
            ION_TIMESTAMP_YMDHMSF,
        }
        if offset and not self.__present:
            raise Exception(
                "IonTimestampTZ has offset %d with non-present format" % offset
            )
        if offset and (offset < -1439 or offset > 1439):
            raise Exception("IonTimestampTZ has invalid offset %d" % offset)
        if fraction_len < 0 or fraction_len > 6:
            raise Exception("IonTimestampTZ has invalid fraction len %d" % fraction_len)
        if fraction_len and format != ION_TIMESTAMP_YMDHMSF:
            raise Exception(
                "IonTimestampTZ has fraction len %d without fraction in format"
                % fraction_len
            )
    def utcoffset(self, dt):
        return datetime.timedelta(minutes=(self.__offset or 0))
    def tzname(self, dt):
        # None renders as Ion's unknown offset "-00:00", zero as "Z".
        if self.__offset is None:
            name = "-00:00"
        elif self.__offset == 0:
            name = "Z"
        else:
            name = "%s%02d:%02d" % (
                "+" if self.__offset >= 0 else "-",
                abs(self.__offset) // 60,
                abs(self.__offset) % 60,
            )
        return name.encode("ascii") if IS_PYTHON2 else name
    def dst(self, dt):
        return datetime.timedelta(0)
    def offset_minutes(self):
        """Return the raw offset in minutes (may be None)."""
        return self.__offset
    def format(self):
        """Return the ION_TIMESTAMP_* format string."""
        return self.__format
    def present(self):
        """True when the timestamp carries a time-of-day (offset applies)."""
        return self.__present
    def fraction_len(self):
        """Return the number of fractional-second digits (0-6)."""
        return self.__fraction_len
    def __eq__(self, other):
        if not isinstance(other, IonTimestampTZ):
            raise Exception(
                "IonTimestampTZ __eq__: comparing with %s" % type_name(other)
            )
        return (self.__offset, self.__format, self.__fraction_len) == (
            other.__offset,
            other.__format,
            other.__fraction_len,
        )
    def __ne__(self, other):
        return not self.__eq__(other)
    # Instances are immutable, so copying can just return self.
    def __copy__(self):
        return self
    def __deepcopy__(self, memo):
        return self
# The closed set of types ion_type() recognizes by exact type match.
ION_TYPES = {
    IonAnnotation,
    IonBool,
    IonBLOB,
    IonCLOB,
    IonDecimal,
    IonFloat,
    IonInt,
    IonList,
    IonNull,
    IonSExp,
    IonString,
    IonStruct,
    IonSymbol,
    IonTimestamp,
}
def unannotated(value):
    """Strip an IonAnnotation wrapper, if any, returning the bare value."""
    if isinstance(value, IonAnnotation):
        return value.value
    return value
def ion_data_eq(f1, f2, msg="Ion data mismatch", report_errors=True):
    """Deep structural equality of two Ion values.

    Returns True/False; when report_errors is set, logs *msg* plus a
    breadcrumb trail (innermost difference first) on mismatch.
    """

    def ion_data_eq_(f1, f2, ctx):
        # ctx accumulates the path to the first difference, innermost first.
        data_type = ion_type(f1)
        if ion_type(f2) is not data_type:
            ctx.append("type mismatch: %s != %s" % (type_name(f1), type_name(f2)))
            return False
        if data_type is IonAnnotation:
            if not ion_data_eq_(IonList(f1.annotations), IonList(f2.annotations), ctx):
                ctx.append("IonAnnotation")
                return False
            if not ion_data_eq_(f1.value, f2.value, ctx):
                ctx.append("in IonAnnotation %s" % repr(f1))
                return False
            return True
        if data_type in [IonList, IonSExp]:
            if len(f1) != len(f2):
                ctx.append("%s length %d != %d" % (type_name(f1), len(f1), len(f2)))
                return False
            for i, (d1, d2) in enumerate(zip(f1, f2)):
                if not ion_data_eq_(d1, d2, ctx):
                    ctx.append("at %s index %d" % (type_name(f1), i))
                    return False
            return True
        if data_type is IonStruct:
            if len(f1) != len(f2):
                ctx.append("IonStruct length %d != %d" % (len(f1), len(f2)))
                return False
            for f1k, f1v in f1.items():
                if f1k not in f2:
                    ctx.append("IonStruct key %s missing" % f1k)
                    return False
                if not ion_data_eq_(f1v, f2[f1k], ctx):
                    ctx.append("at IonStruct key %s" % f1k)
                    return False
            return True
        # NaN never compares equal to itself, so treat NaN == NaN specially.
        if data_type is IonFloat and math.isnan(f1) and math.isnan(f2):
            return True
        # Scalars must match by value AND by repr (catches e.g. 1 vs 1.0
        # style differences that compare equal).
        if f1 != f2 or repr(f1) != repr(f2):
            ctx.append("value %s != %s" % (repr(f1), repr(f2)))
            return False
        return True

    ctx = []
    success = ion_data_eq_(f1, f2, ctx)
    if report_errors and not success:
        # Reverse so the breadcrumb reads outermost-to-innermost.
        log.error("%s: %s" % (msg, ", ".join(ctx[::-1])))
    return success
def filtered_IonList(ion_list, omit_large_blobs=False):
    """Return *ion_list*, optionally replacing large annotated BLOB values
    with their repr summary (used to keep dumps readable)."""
    if not omit_large_blobs:
        return ion_list
    filtered = []
    for val in ion_list[:]:
        if (
            ion_type(val) is IonAnnotation
            and ion_type(val.value) is IonBLOB
            and val.value.is_large()
        ):
            # Keep the annotations, swap the payload for a short summary.
            val = IonAnnotation(val.annotations, repr(val.value))
        filtered.append(val)
    return filtered

View File

@@ -0,0 +1,734 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import decimal
import struct
from .ion import (
ION_TIMESTAMP_Y,
ION_TIMESTAMP_YM,
ION_TIMESTAMP_YMD,
ION_TIMESTAMP_YMDHM,
ION_TIMESTAMP_YMDHMS,
ION_TIMESTAMP_YMDHMSF,
IonAnnotation,
IonBLOB,
IonBool,
IonCLOB,
IonDecimal,
IonFloat,
IonInt,
IonList,
IonNop,
IonNull,
IonSExp,
IonString,
IonStruct,
IonSymbol,
IonTimestamp,
IonTimestampTZ,
ion_type,
)
from .ion_text import IonSerial
from .message_logging import log
from .python_transition import IS_PYTHON2, bytes_, bytes_indexed
from .utilities import Deserializer, Serializer, bytes_to_separated_hex
if IS_PYTHON2:
from .python_transition import repr
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
DEBUG = False  # when True, log hex dumps while (de)serializing binary Ion
class IonBinary(IonSerial):
MAJOR_VERSION = 1
MINOR_VERSION = 0
VERSION_MARKER = 0xE0
SIGNATURE = bytes_([VERSION_MARKER, MAJOR_VERSION, MINOR_VERSION, 0xEA])
def deserialize_multiple_values(
self, data, import_symbols=False, with_offsets=False
):
values = self.deserialize_multiple_values_(data, import_symbols, with_offsets)
return values
SORTED_STRUCT_FLAG = 1
VARIABLE_LEN_FLAG = 14
NULL_FLAG = 15
def serialize_multiple_values_(self, values):
serial = Serializer()
serial.append(IonBinary.SIGNATURE)
for value in values:
serial.append(self.serialize_value(value))
return serial.serialize()
def deserialize_multiple_values_(self, data, import_symbols, with_offsets):
if DEBUG:
log.debug("decoding: %s" % bytes_to_separated_hex(data[:1000]))
serial = Deserializer(data)
self.import_symbols = import_symbols
ion_signature = serial.extract(4)
if ion_signature != IonBinary.SIGNATURE:
raise Exception(
"Ion signature is incorrect (%s)"
% bytes_to_separated_hex(ion_signature)
)
result = []
while len(serial):
if serial.extract(1, advance=False) == IonBinary.VERSION_MARKER:
ion_signature = serial.unpack("4s")
if ion_signature != IonBinary.SIGNATURE:
raise Exception(
"Embedded Ion signature is incorrect (%s)"
% bytes_to_separated_hex(ion_signature)
)
else:
value_offset = serial.offset
value = self.deserialize_value(serial)
if self.import_symbols and isinstance(value, IonAnnotation):
if value.is_annotation("$ion_symbol_table"):
self.symtab.create(value.value)
elif value.is_annotation("$ion_shared_symbol_table"):
self.symtab.catalog.create_shared_symbol_table(value.value)
if not isinstance(value, IonNop):
result.append(
[value_offset, serial.offset - value_offset, value]
if with_offsets
else value
)
return result
def serialize_value(self, value):
handler = IonBinary.ION_TYPE_HANDLERS[ion_type(value)]
signature, data = handler(self, value)
if signature is None:
return data
length = len(data)
if length < IonBinary.VARIABLE_LEN_FLAG:
return descriptor(signature, length) + data
return (
descriptor(signature, IonBinary.VARIABLE_LEN_FLAG)
+ serialize_vluint(length)
+ data
)
def deserialize_value(self, serial):
descriptor = serial.unpack("B")
if descriptor == IonBinary.VERSION_MARKER:
raise Exception("Unexpected Ion version marker within data stream")
signature = descriptor >> 4
flag = descriptor & 0x0F
if DEBUG:
log.debug(
"IonBinary 0x%02x: signature=%d flag=%d data=%s"
% (
descriptor,
signature,
flag,
bytes_to_separated_hex(serial.extract(advance=False)[:16]),
)
)
extract_data, deserializer, name = IonBinary.VALUE_DESERIALIZERS[signature]
if flag == IonBinary.NULL_FLAG and signature != IonBinary.NULL_VALUE_SIGNATURE:
log.error("IonBinary: Deserialized null of type %s" % name)
extract_data, deserializer, name = IonBinary.VALUE_DESERIALIZERS[
IonBinary.NULL_VALUE_SIGNATURE
]
if extract_data:
length = (
deserialize_vluint(serial)
if flag == IonBinary.VARIABLE_LEN_FLAG
else flag
)
return deserializer(self, serial.extract(length))
return deserializer(self, flag, serial)
NULL_VALUE_SIGNATURE = 0
def serialize_null_value(self, value):
return (None, descriptor(IonBinary.NULL_VALUE_SIGNATURE, IonBinary.NULL_FLAG))
def deserialize_null_value(self, flag, serial):
if flag == IonBinary.NULL_FLAG:
return None
length = (
deserialize_vluint(serial) if flag == IonBinary.VARIABLE_LEN_FLAG else flag
)
serial.extract(length)
return IonNop()
BOOL_VALUE_SIGNATURE = 1
def serialize_bool_value(self, value):
return (None, descriptor(IonBinary.BOOL_VALUE_SIGNATURE, 1 if value else 0))
def deserialize_bool_value(self, flag, serial):
if flag > 1:
raise Exception("BinaryIonBool: Unknown IonBool flag value: %d" % flag)
return flag != 0
def serialize_int_value(self, value):
return (
(IonBinary.POSINT_VALUE_SIGNATURE, serialize_unsignedint(value))
if value >= 0
else (IonBinary.NEGINT_VALUE_SIGNATURE, serialize_unsignedint(-value))
)
POSINT_VALUE_SIGNATURE = 2
def deserialize_posint_value(self, data):
return deserialize_unsignedint(data)
NEGINT_VALUE_SIGNATURE = 3
def deserialize_negint_value(self, data):
if len(data) == 0:
log.error("BinaryIonNegInt has no data")
if bytes_indexed(data, 0) == 0:
log.error(
"BinaryIonNegInt data starts with 0x00: %s"
% bytes_to_separated_hex(data)
)
return -deserialize_unsignedint(data)
FLOAT_VALUE_SIGNATURE = 4
def serialize_float_value(self, value):
return (
IonBinary.FLOAT_VALUE_SIGNATURE,
b"" if value == 0.0 else struct.pack(">d", value),
)
def deserialize_float_value(self, data):
if len(data) == 0:
return float(0.0)
if len(data) == 4:
return struct.unpack_from(">f", data)[0]
if len(data) == 8:
return struct.unpack_from(">d", data)[0]
raise Exception(
"IonFloat unexpected data length: %s" % bytes_to_separated_hex(data)
)
    DECIMAL_VALUE_SIGNATURE = 5
    def serialize_decimal_value(self, value):
        # Encoded as a VLSInt exponent followed by a signed-int coefficient;
        # zero is an empty payload.
        if value.is_zero():
            return (IonBinary.DECIMAL_VALUE_SIGNATURE, b"")
        vt = value.as_tuple()
        return (
            IonBinary.DECIMAL_VALUE_SIGNATURE,
            serialize_vlsint(vt.exponent)
            + serialize_signedint(combine_decimal_digits(vt.digits, vt.sign)),
        )
    def deserialize_decimal_value(self, data):
        # Value is coefficient * 10**exponent; an empty payload means 0.
        if len(data) == 0:
            return decimal.Decimal(0)
        serial = Deserializer(data)
        exponent = deserialize_vlsint(serial)
        magnitude = deserialize_signedint(serial.extract())
        return decimal.Decimal(magnitude) * (decimal.Decimal(10) ** exponent)
    TIMESTAMP_VALUE_SIGNATURE = 6
    def serialize_timestamp_value(self, value):
        # Fields are written most- to least-significant; how many are written
        # is controlled by the timestamp's format length.  An IonTimestampTZ
        # tzinfo carries the original format/precision so round-trips are
        # exact; plain tzinfo values default to millisecond precision.
        serial = Serializer()
        if isinstance(value.tzinfo, IonTimestampTZ):
            offset_minutes = value.tzinfo.offset_minutes()
            format_len = len(value.tzinfo.format())
            fraction_exponent = -value.tzinfo.fraction_len()
        else:
            offset_minutes = (
                int(value.utcoffset().total_seconds()) // 60
                if value.utcoffset() is not None
                else None
            )
            format_len = len(ION_TIMESTAMP_YMDHMSF)
            fraction_exponent = -3
        serial.append(serialize_vlsint(offset_minutes))
        serial.append(serialize_vluint(value.year))
        if format_len >= len(ION_TIMESTAMP_YM):
            serial.append(serialize_vluint(value.month))
        if format_len >= len(ION_TIMESTAMP_YMD):
            serial.append(serialize_vluint(value.day))
        if format_len >= len(ION_TIMESTAMP_YMDHM):
            serial.append(serialize_vluint(value.hour))
            serial.append(serialize_vluint(value.minute))
        if format_len >= len(ION_TIMESTAMP_YMDHMS):
            serial.append(serialize_vluint(value.second))
        if format_len >= len(ION_TIMESTAMP_YMDHMSF):
            serial.append(serialize_vlsint(fraction_exponent))
            # Scale microseconds to the stored fraction precision.
            serial.append(
                serialize_signedint(
                    (value.microsecond * int(10**-fraction_exponent))
                    // 1000000
                )
            )
        return (IonBinary.TIMESTAMP_VALUE_SIGNATURE, serial.serialize())
    def deserialize_timestamp_value(self, data):
        """Decode an Ion timestamp; the precision is inferred from which
        fields are present and preserved in an IonTimestampTZ tzinfo."""
        serial = Deserializer(data)
        # -0 offset means "unknown local offset", decoded here as None.
        offset_minutes = deserialize_vlsint(serial, allow_minus_zero=True)
        year = deserialize_vluint(serial)
        month = deserialize_vluint(serial) if len(serial) > 0 else None
        day = deserialize_vluint(serial) if len(serial) > 0 else None
        hour = deserialize_vluint(serial) if len(serial) > 0 else None
        minute = deserialize_vluint(serial) if len(serial) > 0 else None
        second = deserialize_vluint(serial) if len(serial) > 0 else None
        if len(serial) > 0:
            fraction_exponent = deserialize_vlsint(serial)
            fraction_coefficient = (
                deserialize_signedint(serial.extract()) if len(serial) > 0 else 0
            )
            if fraction_coefficient == 0 and fraction_exponent > -1:
                microsecond = None
            else:
                # Only fractions between milliseconds and microseconds are
                # expected; anything else is logged but still converted.
                if fraction_exponent < -6 or fraction_exponent > -1:
                    log.error(
                        "Unexpected IonTimestamp fraction exponent %d coefficient %d: %s"
                        % (
                            fraction_exponent,
                            fraction_coefficient,
                            bytes_to_separated_hex(data),
                        )
                    )
                microsecond = (fraction_coefficient * 1000000) // int(
                    10**-fraction_exponent
                )
                if microsecond < 0 or microsecond > 999999:
                    log.error(
                        "Incorrect IonTimestamp fraction %d usec: %s"
                        % (microsecond, bytes_to_separated_hex(data))
                    )
                    microsecond = None
                    fraction_exponent = 0
        else:
            microsecond = None
            fraction_exponent = 0
        # Precision (and whether an offset is meaningful) follows from the
        # most precise field actually present.
        if month is None:
            format = ION_TIMESTAMP_Y
            offset_minutes = None
        elif day is None:
            format = ION_TIMESTAMP_YM
            offset_minutes = None
        elif hour is None:
            format = ION_TIMESTAMP_YMD
            offset_minutes = None
        elif second is None:
            format = ION_TIMESTAMP_YMDHM
        elif microsecond is None:
            format = ION_TIMESTAMP_YMDHMS
        else:
            format = ION_TIMESTAMP_YMDHMSF
        # Missing fields get neutral defaults; the format records which were
        # really present.
        return IonTimestamp(
            year,
            month if month is not None else 1,
            day if day is not None else 1,
            hour if hour is not None else 0,
            minute if hour is not None else 0,
            second if second is not None else 0,
            microsecond if microsecond is not None else 0,
            IonTimestampTZ(offset_minutes, format, -fraction_exponent),
        )
    SYMBOL_VALUE_SIGNATURE = 7
    def serialize_symbol_value(self, value):
        # Symbols are stored as their integer id in the current symbol table.
        symbol_id = self.symtab.get_id(value)
        if not symbol_id:
            raise Exception("attempt to serialize undefined symbol %s" % repr(value))
        return (IonBinary.SYMBOL_VALUE_SIGNATURE, serialize_unsignedint(symbol_id))
    def deserialize_symbol_value(self, data):
        # Payload is an unsigned symbol id; map it back through the table.
        return self.symtab.get_symbol(deserialize_unsignedint(data))
STRING_VALUE_SIGNATURE = 8
def serialize_string_value(self, value):
return (IonBinary.STRING_VALUE_SIGNATURE, value.encode("utf-8"))
def deserialize_string_value(self, data):
return data.decode("utf-8")
    CLOB_VALUE_SIGNATURE = 9
    def serialize_clob_value(self, value):
        # CLOBs are unexpected in this format; log so appearances are noticed.
        log.error("Serialize CLOB")
        return (IonBinary.CLOB_VALUE_SIGNATURE, bytes(value))
    def deserialize_clob_value(self, data):
        log.error("Deserialize CLOB")
        return IonCLOB(data)
    BLOB_VALUE_SIGNATURE = 10
    def serialize_blob_value(self, value):
        # Raw binary payload, stored verbatim.
        return (IonBinary.BLOB_VALUE_SIGNATURE, bytes(value))
    def deserialize_blob_value(self, data):
        return IonBLOB(data)
LIST_VALUE_SIGNATURE = 11
def serialize_list_value(self, value):
serial = Serializer()
for val in value:
serial.append(self.serialize_value(val))
return (IonBinary.LIST_VALUE_SIGNATURE, serial.serialize())
def deserialize_list_value(self, data, top_level=False):
serial = Deserializer(data)
result = []
while len(serial):
value = self.deserialize_value(serial)
if not isinstance(value, IonNop):
result.append(value)
return result
    SEXP_VALUE_SIGNATURE = 12
    def serialize_sexp_value(self, value):
        # An s-expression is encoded exactly like a list; reuse the list
        # payload ([1] drops the list signature) under the sexp signature.
        return (
            IonBinary.SEXP_VALUE_SIGNATURE,
            self.serialize_list_value(list(value))[1],
        )
    def deserialize_sexp_value(self, data):
        return IonSExp(self.deserialize_list_value(data))
    STRUCT_VALUE_SIGNATURE = 13
    def serialize_struct_value(self, value):
        # Each field is a VLUInt symbol id followed by the encoded value.
        serial = Serializer()
        for key, val in value.items():
            serial.append(serialize_vluint(self.symtab.get_id(key)))
            serial.append(self.serialize_value(val))
        return (IonBinary.STRUCT_VALUE_SIGNATURE, serial.serialize())
    def deserialize_struct_value(self, flag, serial):
        # Sorted structs are not supported; treat the flag as a plain
        # variable-length marker after logging.
        if flag == IonBinary.SORTED_STRUCT_FLAG:
            log.error("BinaryIonStruct: Sorted IonStruct encountered")
            flag = IonBinary.VARIABLE_LEN_FLAG
        serial2 = Deserializer(
            serial.extract(
                deserialize_vluint(serial)
                if flag == IonBinary.VARIABLE_LEN_FLAG
                else flag
            )
        )
        result = IonStruct()
        while len(serial2):
            id_symbol = self.symtab.get_symbol(deserialize_vluint(serial2))
            value = self.deserialize_value(serial2)
            if DEBUG:
                log.debug("IonStruct: %s = %s" % (repr(id_symbol), repr(value)))
            if not isinstance(value, IonNop):
                # Duplicate field names are logged; last value wins.
                if id_symbol in result:
                    log.error("BinaryIonStruct: Duplicate field name %s" % id_symbol)
                result[id_symbol] = value
        return result
    ANNOTATION_VALUE_SIGNATURE = 14
    def serialize_annotation_value(self, value):
        # Layout: VLUInt total length of the annotation ids, the ids
        # themselves, then the single wrapped value.
        if not value.annotations:
            raise Exception("Serializing IonAnnotation without annotations")
        serial = Serializer()
        annotation_data = Serializer()
        for annotation in value.annotations:
            annotation_data.append(serialize_vluint(self.symtab.get_id(annotation)))
        serial.append(serialize_vluint(len(annotation_data)))
        serial.append(annotation_data.serialize())
        serial.append(self.serialize_value(value.value))
        return (IonBinary.ANNOTATION_VALUE_SIGNATURE, serial.serialize())
    def deserialize_annotation_value(self, data):
        serial = Deserializer(data)
        annotation_length = deserialize_vluint(serial)
        annotation_data = Deserializer(serial.extract(annotation_length))
        # Exactly one wrapped value must follow the annotation ids.
        ion_value = self.deserialize_value(serial)
        if len(serial):
            raise Exception(
                "IonAnnotation has excess data: %s"
                % bytes_to_separated_hex(serial.extract())
            )
        annotations = []
        while len(annotation_data):
            annotations.append(
                self.symtab.get_symbol(deserialize_vluint(annotation_data))
            )
        if len(annotations) == 0:
            raise Exception("IonAnnotation has no annotations")
        return IonAnnotation(annotations, ion_value)
    RESERVED_VALUE_SIGNATURE = 15
    def deserialize_reserved_value(self, data):
        # Signature 15 is reserved and never valid in a data stream.
        # NOTE(review): self.value_signature is not set anywhere visible in
        # this class -- formatting this message may itself raise
        # AttributeError; confirm against the full class definition.
        raise Exception(
            "Deserialize reserved ion value signature %d" % self.value_signature
        )
VALUE_DESERIALIZERS = {
NULL_VALUE_SIGNATURE: (False, deserialize_null_value, "null"),
BOOL_VALUE_SIGNATURE: (False, deserialize_bool_value, "bool"),
POSINT_VALUE_SIGNATURE: (True, deserialize_posint_value, "int"),
NEGINT_VALUE_SIGNATURE: (True, deserialize_negint_value, "int"),
FLOAT_VALUE_SIGNATURE: (True, deserialize_float_value, "float"),
DECIMAL_VALUE_SIGNATURE: (True, deserialize_decimal_value, "decimal"),
TIMESTAMP_VALUE_SIGNATURE: (True, deserialize_timestamp_value, "timestamp"),
SYMBOL_VALUE_SIGNATURE: (True, deserialize_symbol_value, "symbol"),
STRING_VALUE_SIGNATURE: (True, deserialize_string_value, "string"),
CLOB_VALUE_SIGNATURE: (True, deserialize_clob_value, "clob"),
BLOB_VALUE_SIGNATURE: (True, deserialize_blob_value, "blob"),
LIST_VALUE_SIGNATURE: (True, deserialize_list_value, "list"),
SEXP_VALUE_SIGNATURE: (True, deserialize_sexp_value, "sexp"),
STRUCT_VALUE_SIGNATURE: (False, deserialize_struct_value, "struct"),
ANNOTATION_VALUE_SIGNATURE: (True, deserialize_annotation_value, "annotation"),
RESERVED_VALUE_SIGNATURE: (True, deserialize_reserved_value, "reserved"),
}
ION_TYPE_HANDLERS = {
IonAnnotation: serialize_annotation_value,
IonBLOB: serialize_blob_value,
IonBool: serialize_bool_value,
IonCLOB: serialize_clob_value,
IonDecimal: serialize_decimal_value,
IonFloat: serialize_float_value,
IonInt: serialize_int_value,
IonList: serialize_list_value,
IonNull: serialize_null_value,
IonSExp: serialize_sexp_value,
IonString: serialize_string_value,
IonStruct: serialize_struct_value,
IonSymbol: serialize_symbol_value,
IonTimestamp: serialize_timestamp_value,
}
def descriptor(signature, flag):
    """Build the one-byte Ion value descriptor: signature in the high nibble,
    flag/length in the low nibble."""
    if not 0 <= flag <= 0x0F:
        raise Exception("Serialize bad descriptor flag: %d" % flag)
    return struct.pack("B", (signature << 4) | flag)
def serialize_unsignedint(value):
    """Big-endian magnitude bytes of *value* with leading zero bytes removed
    (zero encodes as an empty byte string)."""
    return struct.pack(">Q", value).lstrip(b"\x00")
def deserialize_unsignedint(data):
    """Interpret big-endian magnitude bytes as an unsigned int; a leading
    0x00 (non-minimal encoding) is rejected."""
    if data.startswith(b"\x00"):
        raise Exception("BinaryIonInt data padded with 0x00")
    return struct.unpack_from(">Q", lpad0(data, 8))[0]
def serialize_signedint(value):
    """Sign-and-magnitude encoding: trimmed big-endian magnitude with the
    sign carried in the top bit of the first byte."""
    magnitude = ltrim0x(struct.pack(">Q", abs(value)))
    if value >= 0:
        return magnitude
    return or_first_byte(magnitude, 0x80)
def deserialize_signedint(data):
    """Decode sign-and-magnitude bytes; the high bit of the first byte is the
    sign, the rest is a big-endian magnitude."""
    if not data:
        return 0
    if data[:1] >= b"\x80":
        return -(struct.unpack_from(">Q", lpad0(and_first_byte(data, 0x7F), 8))[0])
    return struct.unpack_from(">Q", lpad0(data, 8))[0]
def serialize_vluint(value):
    """Encode a non-negative int as an Ion VLUInt: 7 bits per byte, with the
    high bit set only on the final byte."""
    if value < 0:
        raise Exception("Cannot serialize negative value as IonVLUInt: %d" % value)
    out = bytearray([(value & 0x7F) | 0x80])
    value >>= 7
    while value:
        out.insert(0, value & 0x7F)
        value >>= 7
    return bytes(out)
def deserialize_vluint(serial):
    """Decode an Ion VLUInt from *serial*: accumulate 7 bits per byte until a
    byte with the stop (high) bit arrives."""
    result = 0
    while True:
        byte = serial.unpack("B")
        result = (result << 7) | (byte & 0x7F)
        if byte >= 0x80:
            return result
        if result == 0:
            raise Exception("IonVLUInt padded with 0x00")
        if result > 0x7FFFFFFFFFFFFF:
            raise Exception("IonVLUInt data value is too large, missing terminator")
def serialize_vlsint(value):
    """Encode a signed int (or None, meaning minus-zero) as an Ion VLSInt;
    bit 0x40 of the first byte carries the sign."""
    if value is None:
        return b"\xc0"
    encoded = serialize_vluint(abs(value))
    # If the sign-bit position is already occupied, prepend a zero byte.
    if ord(encoded[:1]) & 0x40:
        encoded = b"\x00" + encoded
    if value >= 0:
        return encoded
    return or_first_byte(encoded, 0x40)
def deserialize_vlsint(serial, allow_minus_zero=False):
    # First byte: bit 0x40 is the sign; bit 0x80 terminates, like VLUInt.
    first = serial.unpack("B")
    # Strip the sign bit, keep the terminator bit and the low 6 data bits.
    ibyte = first & 0xBF
    datalst = []
    if ibyte != 0:
        datalst.append(ibyte)
    while (ibyte & 0x80) == 0:
        ibyte = serial.unpack("B")
        datalst.append(ibyte)
    # Re-decode the collected bytes as an unsigned VLUInt magnitude.
    value = deserialize_vluint(Deserializer(bytes_(datalst)))
    if first & 0x40:
        if value:
            value = -value
        elif allow_minus_zero:
            # Ion uses -0 to mean "unknown" (e.g. unknown timestamp offset).
            value = None
        else:
            raise Exception("deserialize_vlsint unexpected -0 value")
    return value
def lpad0(data, size):
    """Left-pad *data* with zero bytes to exactly *size* bytes.

    If *data* is longer than *size*, the surplus leading bytes must all be
    zero padding; they are stripped so the significant low-order bytes are
    kept.  Raises if stripping would lose non-zero bytes.
    """
    if len(data) > size:
        extra = len(data) - size
        # BUG FIX: the original compared data[:size] against `extra` zero
        # bytes (almost always unequal) and returned data[:size], keeping the
        # padding and discarding the significant trailing bytes.
        if data[:extra] != b"\x00" * extra:
            raise Exception(
                "lpad0, length (%d) > max (%d): %s"
                % (len(data), size, bytes_to_separated_hex(data))
            )
        return data[extra:]
    return b"\x00" * (size - len(data)) + data
def ltrim0(data):
    """Strip all leading zero bytes from *data*."""
    return data.lstrip(b"\x00")
def ltrim0x(data):
    """Strip leading zero bytes, but keep one zero when the following byte
    has its high bit set (that zero protects the sign bit)."""
    while data.startswith(b"\x00"):
        # Single-byte slice comparison: b >= 0x80 means the high bit is set.
        if data[1:2] >= b"\x80":
            break
        data = data[1:]
    return data
def combine_decimal_digits(digits, sign_negative):
    """Fold a sequence of base-10 digits into a single int, negated when
    *sign_negative* is truthy."""
    magnitude = 0
    for digit in digits:
        magnitude = magnitude * 10 + digit
    return -magnitude if sign_negative else magnitude
def and_first_byte(data, mask):
    """Return *data* with its first byte ANDed with *mask*."""
    return struct.pack("B", ord(data[:1]) & mask) + data[1:]
def or_first_byte(data, mask):
    """Return *data* with its first byte ORed with *mask*."""
    return struct.pack("B", ord(data[:1]) | mask) + data[1:]

View File

@@ -0,0 +1,460 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import re
from .ion import (
IS,
IonAnnotation,
IonStruct,
IonSymbol,
ion_type,
isstring,
unannotated,
)
from .message_logging import log
from .python_transition import IS_PYTHON2
from .utilities import list_symbols, quote_name, type_name
from .yj_symbol_catalog import SYSTEM_SYMBOL_TABLE, YJ_SYMBOLS, IonSharedSymbolTable
if IS_PYTHON2:
from .python_transition import repr
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
DEBUG = False  # enables verbose symbol-table debug logging below
REPORT_ALL_USED_SYMBOLS = False  # when True, every shared-symbol use is flagged for review
class SymbolTableCatalog(object):
    """Registry of Ion shared symbol tables, keyed by (name, version).

    The key (name, None) is an alias that always points at the newest version
    registered for that name, so lookups without an explicit version get the
    latest table.
    """

    def __init__(self, add_global_shared_symbol_tables=False):
        self.shared_symbol_tables = {}
        self.clear()
        if add_global_shared_symbol_tables:
            self.add_global_shared_symbol_tables()

    def clear(self):
        """Drop all registered tables, keeping only the $ion system table."""
        self.shared_symbol_tables.clear()
        self.add_shared_symbol_table(SYSTEM_SYMBOL_TABLE)

    def add_global_shared_symbol_tables(self):
        """Register the built-in YJ (Kindle KFX) symbol table."""
        self.add_shared_symbol_table(YJ_SYMBOLS)

    def add_shared_symbol_table(self, shared_symbol_table):
        """Register a table under (name, version); update the (name, None)
        "latest" alias only when this version is at least as new."""
        self.shared_symbol_tables[
            (shared_symbol_table.name, shared_symbol_table.version)
        ] = shared_symbol_table
        # BUG FIX: the original tested `name not in self.shared_symbol_tables`
        # but the dict keys are (name, version) tuples, so the test was always
        # True and the "latest" alias was clobbered even by older versions.
        latest_key = (shared_symbol_table.name, None)
        if (
            latest_key not in self.shared_symbol_tables
            or shared_symbol_table.version
            >= self.shared_symbol_tables[latest_key].version
        ):
            self.shared_symbol_tables[latest_key] = shared_symbol_table

    def create_shared_symbol_table(self, symbol_table_data):
        """Build and register a table from a shared-symbol-table struct."""
        self.add_shared_symbol_table(
            IonSharedSymbolTable(
                symbol_table_data["name"],
                symbol_table_data["version"] if "version" in symbol_table_data else 1,
                symbol_table_data["symbols"] if "symbols" in symbol_table_data else [],
            )
        )

    def get_shared_symbol_table(self, name, version=None):
        """Exact (name, version) match, falling back to the latest version."""
        return self.shared_symbol_tables.get(
            (name, version)
        ) or self.shared_symbol_tables.get((name, None))
global_catalog = SymbolTableCatalog(add_global_shared_symbol_tables=True)
class SymbolTableImport(object):
    """Record of one shared-table import: the table name, the requested
    version and the number of symbol ids imported (max_id)."""
    def __init__(self, name, version, max_id):
        self.name = name
        self.version = version
        self.max_id = max_id
class LocalSymbolTable(object):
    """Mutable Ion local symbol table: shared-table imports plus local symbols.

    Symbol ids are 1-based.  Ids below local_min_id belong to imported shared
    tables; ids at or above it are locally created.  Undefined or unexpected
    symbol usage is collected and summarized by report().
    """
    def __init__(
        self,
        initial_import=None,
        context="",
        ignore_undef=False,
        catalog=global_catalog,
    ):
        self.context = context
        self.ignore_undef = ignore_undef
        self.catalog = catalog
        self.undefined_ids = set()
        self.undefined_symbols = set()
        self.unexpected_used_symbols = set()
        self.reported = False
        self.clear()
        self.set_translation(None)
        if initial_import:
            self.import_shared_symbol_table(initial_import)
    def clear(self):
        """Reset to just the $ion system symbols; forget all imports/locals."""
        self.table_imports = []
        self.symbols = []
        self.id_of_symbol = {}
        self.symbol_of_id = {}
        self.unexpected_ids = set()
        self.creating_local_symbols = False
        self.creating_yj_local_symbols = False
        self.import_symbols(self.catalog.get_shared_symbol_table("$ion").symbols)
        self.local_min_id = len(self.symbols) + 1
    def create(self, symbol_table_data, yj_local_symbols=False):
        """Populate from a $ion_symbol_table struct: imports then local symbols."""
        if "imports" in symbol_table_data:
            imports = symbol_table_data["imports"]
            if ion_type(imports) is IonSymbol:
                # imports == $ion_symbol_table means "append to current table".
                if imports != "$ion_symbol_table":
                    raise Exception("Unexpected imports value: %s" % imports)
            else:
                self.clear()
                for sym_import in imports:
                    self.import_shared_symbol_table(
                        sym_import["name"],
                        sym_import.get("version") or 1,
                        sym_import.get("max_id"),
                    )
        else:
            self.clear()
        symbol_list = (
            unannotated(symbol_table_data["symbols"])
            if "symbols" in symbol_table_data
            else []
        )
        self.creating_local_symbols = True
        self.import_symbols(symbol_list)
        if "max_id" in symbol_table_data:
            expected_max_id = symbol_table_data["max_id"]
            if expected_max_id is not None and expected_max_id != len(self.symbols):
                log.error(
                    "Symbol table max_id after import expected %d, found %d"
                    % (expected_max_id, len(self.symbols))
                )
    def import_shared_symbol_table(self, name, version=None, max_id=None):
        """Append the symbols of a shared table (looked up in the catalog),
        honoring max_id truncation/extension per the Ion import rules."""
        if DEBUG:
            log.debug(
                "Importing ion symbol table %s version %s max_id %s"
                % (quote_name(name), version, max_id)
            )
        if self.creating_local_symbols:
            raise Exception(
                "Importing shared symbols after local symbols have been created"
            )
        # The system table is always present; importing it again is a no-op.
        if name == "$ion":
            return
        symbol_table = self.catalog.get_shared_symbol_table(name, version)
        if symbol_table is None:
            log.error("Imported shared symbol table %s is unknown" % name)
            symbol_table = IonSharedSymbolTable(name=name, version=version)
        if version is None:
            version = symbol_table.version
        elif symbol_table.version != version:
            if max_id is None:
                log.error(
                    "Import version %d of shared symbol table %s without max_id, but have version %d"
                    % (version, name, symbol_table.version)
                )
            else:
                log.warning(
                    "Import version %d of shared symbol table %s, but have version %d"
                    % (version, name, symbol_table.version)
                )
        table_len = len(symbol_table.symbols)
        if max_id is None:
            max_id = table_len
        if max_id < 0:
            raise Exception(
                "Import symbol table %s version %d max_id %d is invalid"
                % (name, version, max_id)
            )
        self.table_imports.append(SymbolTableImport(name, version, max_id))
        if max_id < table_len:
            # Truncated import: only the first max_id symbols are visible.
            symbol_list = symbol_table.symbols[:max_id]
        elif max_id > table_len:
            # Extended import: pad with None placeholders to reserve the ids.
            if table_len > 0:
                prior_len = len(self.symbols)
                log.warning(
                    "Import symbol table %s version %d max_id %d(+%d=%d) exceeds known table size %d(+%d=%d)"
                    % (
                        name,
                        version,
                        max_id,
                        prior_len,
                        max_id + prior_len,
                        table_len,
                        prior_len,
                        table_len + prior_len,
                    )
                )
            symbol_list = symbol_table.symbols + ([None] * (max_id - table_len))
        else:
            symbol_list = symbol_table.symbols
        self.import_symbols(symbol_list)
        self.local_min_id = len(self.symbols) + 1
    def import_symbols(self, symbols):
        """Append symbols in order; non-string entries become None placeholders."""
        for symbol in symbols:
            symbol = unannotated(symbol)
            if symbol is not None:
                if not isstring(symbol):
                    log.error(
                        "imported symbol %s is type %s, treating as null"
                        % (symbol, type_name(symbol))
                    )
                    symbol = None
            self.add_symbol(symbol)
    def create_local_symbol(self, symbol):
        """Ensure *symbol* exists as a local symbol; return it as an IonSymbol."""
        self.creating_local_symbols = True
        if symbol not in self.id_of_symbol:
            self.add_symbol(symbol)
        return IonSymbol(symbol)
    def add_symbol(self, symbol):
        """Append one symbol and return its id (-1 for a None placeholder).

        A trailing "?" on a shared symbol marks it "unexpected": its use is
        tracked and reported.
        """
        if symbol is None:
            self.symbols.append(None)
            return -1
        if not isstring(symbol):
            raise Exception(
                "symbol %s is type %s, not string" % (symbol, type_name(symbol))
            )
        if len(symbol) == 0:
            raise Exception("symbol has zero length")
        expected = True
        if not self.creating_local_symbols:
            if symbol.endswith("?"):
                symbol = symbol[:-1]
                expected = False
            elif REPORT_ALL_USED_SYMBOLS:
                expected = False
        self.symbols.append(symbol)
        if symbol not in self.id_of_symbol:
            symbol_id = len(self.symbols)
            self.id_of_symbol[symbol] = symbol_id
            self.symbol_of_id[symbol_id] = symbol
        else:
            # Duplicate: the new id still maps to the symbol, but the
            # symbol keeps its original (first) id.
            self.symbol_of_id[len(self.symbols)] = symbol
            symbol_id = self.id_of_symbol[symbol]
            log.error("Symbol %s already exists with id %d" % (symbol, symbol_id))
        if not expected:
            self.unexpected_ids.add(symbol_id)
        return symbol_id
    def get_symbol(self, symbol_id):
        """Return the IonSymbol for *symbol_id*; unknown ids become "$<id>"
        and are recorded as undefined."""
        if not isinstance(symbol_id, int):
            raise Exception(
                "get_symbol: symbol id must be integer not %s: %s"
                % (type_name(symbol_id), repr(symbol_id))
            )
        symbol = self.symbol_of_id.get(symbol_id)
        if symbol is None:
            symbol = "$%d" % symbol_id
            self.undefined_ids.add(symbol_id)
        if symbol_id in self.unexpected_ids:
            self.unexpected_used_symbols.add(symbol)
        return IonSymbol(symbol)
    def get_id(self, ion_symbol, used=True):
        """Return the id of *ion_symbol* (0 when undefined).  "$<n>" names are
        treated as literal ids.  *used* controls usage tracking."""
        if not isinstance(ion_symbol, IonSymbol):
            raise Exception(
                "get_id: symbol must be IonSymbol not %s: %s"
                % (type_name(ion_symbol), repr(ion_symbol))
            )
        symbol = ion_symbol.tostring()
        if symbol.startswith("$") and re.match(r"^\$[0-9]+$", symbol):
            symbol_id = int(symbol[1:])
            if symbol_id not in self.symbol_of_id:
                self.undefined_ids.add(symbol_id)
        else:
            symbol_id = self.id_of_symbol.get(symbol)
            if symbol_id is None:
                if used:
                    self.undefined_symbols.add(symbol)
                symbol_id = 0
        if used and symbol_id in self.unexpected_ids:
            self.unexpected_used_symbols.add(symbol)
        return symbol_id
    def is_shared_symbol(self, ion_symbol):
        """True when the symbol's id falls in the imported (shared) range."""
        symbol_id = self.get_id(ion_symbol, used=False)
        return symbol_id > 0 and symbol_id < self.local_min_id
    def is_local_symbol(self, ion_symbol):
        """True when the symbol's id falls in the locally-created range."""
        return self.get_id(ion_symbol, used=False) >= self.local_min_id
    def replace_local_symbols(self, new_symbols):
        """Swap out all local symbols for *new_symbols*."""
        self.discard_local_symbols()
        self.import_symbols(new_symbols)
    def get_local_symbols(self):
        """Return the locally-created symbols (ids >= local_min_id)."""
        return self.symbols[self.local_min_id - 1 :]
    def discard_local_symbols(self):
        """Remove all locally-created symbols and their id mappings."""
        symbol_id = self.local_min_id
        for symbol in self.symbols[self.local_min_id - 1 :]:
            self.id_of_symbol.pop(symbol)
            self.symbol_of_id.pop(symbol_id)
            symbol_id += 1
        self.symbols = self.symbols[: self.local_min_id - 1]
    def create_import(self, imports_only=False):
        """Build the $ion_symbol_table annotation describing this table, or
        None when the table is empty."""
        if not self.symbols:
            return None
        symbol_table_data = IonStruct()
        if not imports_only:
            symbol_table_data[IS("max_id")] = len(self.symbols)
        symbol_table_data[IS("imports")] = [
            IonStruct(
                IS("name"),
                table_import.name,
                IS("version"),
                table_import.version,
                IS("max_id"),
                table_import.max_id,
            )
            for table_import in self.table_imports
        ]
        if not imports_only:
            symbol_table_data[IS("symbols")] = self.symbols[self.local_min_id - 1 :]
        return IonAnnotation([IS("$ion_symbol_table")], symbol_table_data)
    def set_translation(self, alt_symbol_table):
        """Build import/export symbol translation maps against an alternate
        version of one imported shared table (or clear them for None)."""
        self.import_translate = {}
        self.export_translate = {}
        if alt_symbol_table is None:
            return
        # Ids start after the $ion system symbols.
        offset = len(self.catalog.get_shared_symbol_table("$ion").symbols) + 1
        for table_import in self.table_imports:
            if table_import.name == alt_symbol_table.name:
                orig_symbol_table = self.catalog.get_shared_symbol_table(
                    table_import.name, table_import.version
                )
                for idx in range(
                    max(len(orig_symbol_table.symbols), len(alt_symbol_table.symbols))
                ):
                    have_orig = idx < len(orig_symbol_table.symbols)
                    have_alt = idx < len(alt_symbol_table.symbols)
                    orig_symbol = (
                        orig_symbol_table.symbols[idx]
                        if have_orig
                        else "$%d" % (idx + offset)
                    )
                    # Drop the "unexpected" marker from catalog symbols.
                    if orig_symbol.endswith("?"):
                        orig_symbol = orig_symbol[:-1]
                    alt_symbol = (
                        alt_symbol_table.symbols[idx]
                        if have_alt
                        else "$%d" % (idx + offset)
                    )
                    if have_alt:
                        self.import_translate[alt_symbol] = orig_symbol
                    if have_orig:
                        self.export_translate[orig_symbol] = alt_symbol
                break
            offset += table_import.max_id
    def __repr__(self):
        return "symbols: %s; id_of_symbol %s; symbol_of_id %s" % (
            repr(self.symbols),
            repr(self.id_of_symbol),
            repr(self.symbol_of_id),
        )
    def report(self):
        """Log accumulated undefined/unexpected symbol usage (once)."""
        if self.reported:
            return
        context = ("%s: " % self.context) if self.context else ""
        if self.unexpected_used_symbols:
            log.error(
                "%sUnexpected Ion symbols used: %s"
                % (context, list_symbols(self.unexpected_used_symbols))
            )
        if self.undefined_symbols and not self.ignore_undef:
            log.error(
                "%sUndefined Ion symbols found: %s"
                % (
                    context,
                    ", ".join([quote_name(s) for s in sorted(self.undefined_symbols)]),
                )
            )
        if self.undefined_ids:
            log.error(
                "%sUndefined Ion symbol IDs found: %s"
                % (context, list_symbols(self.undefined_ids))
            )
        self.reported = True

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,164 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
import uuid
from .jxr_image import JXRImage
from .jxr_misc import Deserializer, bytes_to_separated_hex
from .message_logging import log
# Python 2 compatibility: make "str" refer to the text (unicode) type.
if sys.version_info[0] == 2:
    str = type("")
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
# TIFF/IFD field type id -> size in bytes of one element of that type.
FIELD_TYPE_LEN = {
    1: 1,
    2: 1,
    3: 2,
    4: 4,
    5: 8,
    6: 1,
    7: 1,
    8: 2,
    9: 4,
    10: 8,
    11: 4,
    12: 8,
}
# TIFF/IFD field type id -> struct format used to read a single value.
# Types 2 and 7 are raw byte strings; the 8-byte rational types (5, 10)
# have no single-value format here.
LEN_FMT = {
    1: "B",
    2: "s",
    3: "<H",
    4: "<L",
    6: "b",
    7: "s",
    8: "<h",
    9: "<l",
    11: "<f",
    12: "<d",
}
# JPEG XR pixel-format GUIDs (canonical UUID string form) that the decoder
# understands, mapped to their descriptive names.
SUPPORTED_PIXEL_FORMATS = {
    "24c3dd6f-034e-fe4b-b185-3d77768dc905": "BlackWhite",
    "24c3dd6f-034e-fe4b-b185-3d77768dc908": "8bppGray",
    "24c3dd6f-034e-fe4b-b185-3d77768dc90b": "16bppGray",
    "24c3dd6f-034e-fe4b-b185-3d77768dc90c": "24bppBGR",
    "24c3dd6f-034e-fe4b-b185-3d77768dc90d": "24bppRGB",
    "24c3dd6f-034e-fe4b-b185-3d77768dc90f": "32bppRGBA",
    "24c3dd6f-034e-fe4b-b185-3d77768dc920": "24bpp3Channels",
    "24c3dd6f-034e-fe4b-b185-3d77768dc921": "32bpp4Channels",
}
class JXRContainer(object):
    """Parses the TIFF-style JPEG XR container and exposes the image stream.

    Only single-image files are supported.  __init__ sets image_width,
    image_height and image_data (the raw codestream bytes).
    """
    def __init__(self, data):
        header = Deserializer(data)
        tif_signature = header.extract(4)
        # JPEG XR uses the little-endian TIFF signature variant "II\xbc\x01".
        if tif_signature != b"\x49\x49\xbc\x01":
            raise Exception(
                "TIF signature is incorrect: %s" % bytes_to_separated_hex(tif_signature)
            )
        ifd_offset = header.unpack("<L", "ifd_offset")
        header.extract(ifd_offset - header.offset)
        pixel_format = ""
        self.image_width = (
            self.image_height
        ) = image_offset = image_byte_count = self.image_data = None
        num_entries = header.unpack("<H", "num_entries")
        def field_value():
            # NOTE: closes over field_type/field_data of the loop iteration
            # in which it is called.
            return Deserializer(field_data).unpack(LEN_FMT[field_type], "field_value")
        for i in range(num_entries):
            field_tag = header.unpack("<H", "field_tag")
            field_type = header.unpack("<H", "field_type")
            field_count = header.unpack("<L", "field_count")
            field_data_len = FIELD_TYPE_LEN[field_type] * field_count
            # IFD rule: values of up to 4 bytes are stored inline (padded to
            # 4 bytes); larger values are stored at a file offset.
            if field_data_len <= 4:
                field_data = header.extract(field_data_len)
                header.extract(4 - field_data_len)
            else:
                field_data_or_offset = header.unpack("<L", "field_data_or_offset")
                field_data = data[
                    field_data_or_offset : field_data_or_offset + field_data_len
                ]
            if field_tag == 0xBC01:
                pixel_format = str(uuid.UUID(bytes=field_data))
            elif field_tag == 0xBC80:
                self.image_width = field_value()
            elif field_tag == 0xBC81:
                self.image_height = field_value()
            elif field_tag == 0xBCC0:
                image_offset = field_value()
            elif field_tag == 0xBCC1:
                image_byte_count = field_value()
        if not (
            pixel_format
            and self.image_width
            and self.image_height
            and image_offset
            and (image_byte_count is not None)
        ):
            raise Exception(
                "Missing required TIFF field tag: pixel_format=%s width=%s height=%s offset=%s byte-count=%s"
                % (
                    pixel_format,
                    self.image_width,
                    self.image_height,
                    image_offset,
                    image_byte_count,
                )
            )
        if pixel_format not in SUPPORTED_PIXEL_FORMATS:
            log.warning("Unsupported pixel format: %s" % pixel_format)
        # A non-zero next-IFD offset would mean additional images follow.
        ifd_offset = header.unpack("<L", "ifd_offset")
        if ifd_offset != 0:
            raise Exception(
                "File contains multiple images - only a single image is supported"
            )
        # A byte count of 0 means the codestream runs to the end of the file.
        if image_byte_count > 0:
            self.image_data = data[image_offset : image_offset + image_byte_count]
            if len(self.image_data) < image_byte_count:
                log.warning(
                    "File is truncated (missing %d bytes of image data)"
                    % (image_byte_count - len(self.image_data))
                )
        else:
            self.image_data = data[image_offset:]
    def unpack_image(self):
        """Decode the embedded codestream; warn when its size disagrees with
        the container's IFD fields."""
        jxr_image = JXRImage(self.image_data)
        im = jxr_image.decode()
        if (
            jxr_image.image_width != self.image_width
            or jxr_image.image_height != self.image_height
        ):
            log.warning(
                "Expected image size %dx%d but found %dx%d"
                % (
                    self.image_width,
                    self.image_height,
                    jxr_image.image_width,
                    jxr_image.image_height,
                )
            )
        return im

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,122 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import struct
from .message_logging import log
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
DEBUG = False


class Deserializer(object):
    """Byte- and bit-level reader over an immutable byte string.

    Tracks a byte offset plus a small bit buffer (*remainder* /
    *bits_remaining*) so callers can mix struct-style byte reads with
    arbitrary-width bit reads.
    """

    def __init__(self, data):
        self.buffer = data
        self.offset = 0
        self.bits_remaining = self.remainder = 0

    def extract(self, size=None, upto=None, advance=True, check_remaining=True):
        """Return the next *size* bytes (or up to offset *upto*, or everything
        remaining); raises if fewer bytes are available."""
        if check_remaining and self.bits_remaining:
            raise Exception(
                "Deserializer: unexpected %d bit remaining" % self.bits_remaining
            )
        if size is None:
            size = len(self) if upto is None else (upto - self.offset)
        data = self.buffer[self.offset : self.offset + size]
        if len(data) < size or size < 0:
            raise Exception(
                "Deserializer: Insufficient data (need %d bytes, have %d bytes)"
                % (size, len(data))
            )
        if advance:
            self.offset += size
        return data

    def unpack(self, fmt, name="", advance=True):
        """struct-unpack a single value at the current byte offset."""
        if self.bits_remaining:
            raise Exception(
                "Deserializer: unexpected %d bit remaining" % self.bits_remaining
            )
        result = struct.unpack_from(fmt, self.buffer, self.offset)[0]
        if DEBUG:
            log.info("%d: unpack(%s)=%s %s" % (self.offset, fmt, repr(result), name))
        if advance:
            self.offset += struct.calcsize(fmt)
        return result

    def unpack_bits(self, size, name=""):
        """Read *size* bits, MSB first, refilling the bit buffer by whole
        bytes as needed."""
        while self.bits_remaining < size:
            self.remainder = (self.remainder << 8) + ord(
                self.extract(1, check_remaining=False)
            )
            self.bits_remaining += 8
        self.bits_remaining -= size
        value = self.remainder >> self.bits_remaining
        if value > (1 << size) - 1:
            raise Exception()
        self.remainder = self.remainder & (0xFF >> (8 - self.bits_remaining))
        if DEBUG:
            log.info(
                "%d: unpack_bits(%d)=%u (%s) %s"
                % (self.offset, size, value, ("{0:0%sb}" % size).format(value), name)
            )
        return value

    def unpack_flag(self, name=""):
        """Read a single bit as a bool."""
        return self.unpack_bits(1, name) == 1

    def push_bit(self, value):
        """Push one bit back onto the bit buffer (the undo of unpack_bits(1)).

        BUG FIX: the original used ``&=``, which cleared the buffer instead
        of inserting the bit; ``|=`` places the bit above the current
        remainder bits.
        """
        self.remainder |= (value & 1) << self.bits_remaining
        self.bits_remaining += 1

    def check_bit_field(self, size, name, expected_values, name_table={}):
        """Read a bit field and require its value to be one of
        *expected_values*; *name_table* supplies display names."""
        def value_name(v):
            return name_table.get(v, "%d" % v)
        value = self.unpack_bits(size, name)
        if value not in expected_values:
            msg = "%s value %s is unsupported (only %s allowed)" % (
                name,
                value_name(value),
                ", ".join([value_name(ev) for ev in expected_values]),
            )
            raise Exception(msg)
        return value

    def huff(self, table, name):
        """Decode one symbol by walking bits through huffman code *table*,
        whose keys are 1-prefixed code words."""
        k = 1
        while k <= 0xFF:
            k = (k << 1) + self.unpack_bits(1, name)
            v = table.get(k)
            if v is not None:
                return v
        raise Exception("decode using huffman table failed")

    def discard_remainder_bits(self):
        """Drop any partially-consumed byte so byte-aligned reads can resume."""
        self.bits_remaining = self.remainder = 0

    def __len__(self):
        """Number of unread bytes."""
        return len(self.buffer) - self.offset
def bytes_to_separated_hex(data, sep=" "):
    """Render *data* as two-digit lowercase hex values joined by *sep*."""
    return sep.join("%02x" % b for b in bytearray(data))

View File

@@ -0,0 +1,598 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
from .ion import IS, IonAnnotation, IonBLOB, IonStruct
from .ion_binary import IonBinary
from .message_logging import log
from .python_transition import IS_PYTHON2, bytes_to_hex
from .utilities import (
Deserializer,
Serializer,
bytes_to_separated_hex,
json_deserialize,
json_serialize_compact,
sha1,
type_name,
)
from .yj_container import (
CONTAINER_FORMAT_KFX_ATTACHABLE,
CONTAINER_FORMAT_KFX_MAIN,
CONTAINER_FORMAT_KFX_METADATA,
CONTAINER_FRAGMENT_TYPES,
DRMION_SIGNATURE,
RAW_FRAGMENT_TYPES,
YJContainer,
YJFragment,
)
from .yj_symbol_catalog import SYSTEM_SYMBOL_TABLE
if IS_PYTHON2:
from .python_transition import repr
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
DEBUG = False  # enables verbose container-info logging below
MAX_KFX_CONTAINER_SIZE = 16 * 1024 * 1024
# Only these values are accepted in container info (anything else is logged
# as unexpected by the deserializer below).
DEFAULT_COMPRESSION_TYPE = 0
DEFAULT_DRM_SCHEME = 0
# Fragment-type symbol ids expected in each KFX container flavor.
KFX_MAIN_CONTAINER_FRAGMENT_IDNUMS = {
    259,
    260,
    538,
}
KFX_METADATA_CONTAINER_FRAGMENT_IDNUMS = {
    258,
    419,
    490,
    585,
}
KFX_ATTACHABLE_CONTAINER_FRAGMENT_IDNUMS = {
    417,
}
class KfxContainer(YJContainer):
    SIGNATURE = b"CONT"
    DRM_SIGNATURE = DRMION_SIGNATURE
    VERSION = 2
    ALLOWED_VERSIONS = {1, 2}
    # Fixed header: 4-byte signature + 2-byte version + 4-byte header length
    # + 4-byte info offset + 4-byte info length = 18 bytes.
    MIN_LENGTH = 18
    DEFAULT_CHUNK_SIZE = 4096
    def __init__(self, symtab, datafile=None, fragments=None):
        # Thin constructor: all state handling lives in the YJContainer base.
        YJContainer.__init__(self, symtab, datafile=datafile, fragments=fragments)
def deserialize(self, ignore_drm=False):
self.doc_symbols = None
self.format_capabilities = None
self.container_info = None
self.entities = []
self.fragments.clear()
data = self.datafile.get_data()
if len(data) < KfxContainer.MIN_LENGTH:
raise Exception("Container is too short (%d bytes)" % len(data))
header = Deserializer(data)
signature = header.unpack("4s")
version = header.unpack("<H")
header_len = header.unpack("<L")
if signature != KfxContainer.SIGNATURE:
pdb_creator = data[64:68]
if pdb_creator in [b"MOBI", b"CONT"]:
raise Exception(
"Found a PDB %s container. This book is not in KFX format."
% pdb_creator.decode("utf8")
)
raise Exception(
"Container signature is incorrect (%s)"
% bytes_to_separated_hex(signature)
)
if version not in KfxContainer.ALLOWED_VERSIONS:
log.error("Container version is incorrect (%d)" % version)
if header_len < KfxContainer.MIN_LENGTH:
raise Exception("Container header is too short (%d)" % header_len)
container_info_offset = header.unpack(b"<L")
container_info_length = header.unpack(b"<L")
container_info_data = data[
container_info_offset : container_info_offset + container_info_length
]
container_info = IonBinary(self.symtab).deserialize_single_value(
container_info_data
)
if DEBUG:
log.debug("container info:\n%s" % repr(container_info))
container_id = container_info.pop("$409", "")
compression_type = container_info.pop("$410", DEFAULT_COMPRESSION_TYPE)
if compression_type != DEFAULT_COMPRESSION_TYPE:
log.error(
"Unexpected bcComprType in container %s info: %s"
% (container_id, repr(compression_type))
)
drm_scheme = container_info.pop("$411", DEFAULT_DRM_SCHEME)
if drm_scheme != DEFAULT_DRM_SCHEME:
log.error(
"Unexpected bcDRMScheme in container %s info: %s"
% (container_id, repr(drm_scheme))
)
doc_symbol_offset = container_info.pop("$415", None)
doc_symbol_length = container_info.pop("$416", 0)
if doc_symbol_length:
doc_symbol_data = data[
doc_symbol_offset : doc_symbol_offset + doc_symbol_length
]
self.doc_symbols = IonBinary(self.symtab).deserialize_annotated_value(
doc_symbol_data, expect_annotation="$ion_symbol_table"
)
if DEBUG:
log.debug("Document symbols:\n%s" % repr(self.doc_symbols))
for sym_import in self.doc_symbols.value["imports"]:
if "max_id" in sym_import:
sym_import["max_id"] -= len(SYSTEM_SYMBOL_TABLE.symbols)
self.symtab.create(self.doc_symbols.value)
chunk_size = container_info.pop("$412", KfxContainer.DEFAULT_CHUNK_SIZE)
if chunk_size != KfxContainer.DEFAULT_CHUNK_SIZE:
log.warning(
"Unexpected bcChunkSize in container %s info: %d"
% (container_id, chunk_size)
)
if version > 1:
format_capabilities_offset = container_info.pop("$594", None)
format_capabilities_length = container_info.pop("$595", 0)
if format_capabilities_length:
format_capabilities_data = data[
format_capabilities_offset : format_capabilities_offset
+ format_capabilities_length
]
self.format_capabilities = IonBinary(
self.symtab
).deserialize_annotated_value(
format_capabilities_data, expect_annotation="$593"
)
if DEBUG:
log.debug(
"Format capabilities:\n%s" % repr(self.format_capabilities)
)
type_idnums = set()
index_table_offset = container_info.pop("$413", None)
index_table_length = container_info.pop("$414", 0)
if len(container_info):
log.error("container_info has extra data: %s" % repr(container_info))
payload_sha1 = bytes_to_hex(sha1(data[header_len:]))
kfxgen_package_version = ""
kfxgen_application_version = ""
kfxgen_info_data = (
data[container_info_offset + container_info_length : header_len]
.replace(b"\x1b", b"")
.decode("ascii", errors="ignore")
)
kfxgen_info_json = (
kfxgen_info_data.replace("key :", '"key":')
.replace("key:", '"key":')
.replace("value:", '"value":')
)
try:
kfxgen_info = json_deserialize(kfxgen_info_json)
except Exception:
log.info("Exception decoding json: %s" % kfxgen_info_json)
raise
for info in kfxgen_info:
key = info.pop("key")
value = info.pop("value")
if key in {"appVersion", "kfxgen_application_version"}:
kfxgen_application_version = value
elif key in {"buildVersion", "kfxgen_package_version"}:
kfxgen_package_version = value
elif key == "kfxgen_payload_sha1":
if value != payload_sha1:
log.error(
"Incorrect kfxgen_payload_sha1 in container %s" % container_id
)
log.info("value=%s sha1=%s" % (value, payload_sha1))
elif key == "kfxgen_acr":
if value != container_id:
log.error(
"Unexpected kfxgen_acr in container %s: %s"
% (container_id, value)
)
else:
log.error("kfxgen_info has unknown key: %s = %s" % (key, value))
if len(info):
log.error("kfxgen_info has extra data: %s" % repr(info))
if index_table_length:
entity_table = Deserializer(
data[index_table_offset : index_table_offset + index_table_length]
)
while len(entity_table):
id_idnum = entity_table.unpack("<L")
type_idnum = entity_table.unpack("<L")
entity_offset = entity_table.unpack("<Q")
entity_len = entity_table.unpack("<Q")
type_idnums.add(type_idnum)
entity_start = header_len + entity_offset
if DEBUG:
log.debug(
"Container entity: id=%d type=%d len=%d"
% (id_idnum, type_idnum, entity_len)
)
if entity_start + entity_len > len(data):
raise Exception(
"Container %s (%d bytes) is not large enough for entity end (offset %d)"
% (container_id, len(data), entity_start + entity_len)
)
self.entities.append(
KfxContainerEntity(
self.symtab,
id_idnum,
type_idnum,
serialized_data=data[entity_start : entity_start + entity_len],
)
)
if type_idnums & KFX_MAIN_CONTAINER_FRAGMENT_IDNUMS:
container_format = CONTAINER_FORMAT_KFX_MAIN
elif (type_idnums & KFX_METADATA_CONTAINER_FRAGMENT_IDNUMS) or (
doc_symbol_length > 0
):
container_format = CONTAINER_FORMAT_KFX_METADATA
elif type_idnums & KFX_ATTACHABLE_CONTAINER_FRAGMENT_IDNUMS:
container_format = CONTAINER_FORMAT_KFX_ATTACHABLE
else:
log.error("Cannot determine KFX container type of %s" % container_id)
container_format = "KFX unknown"
self.container_info = IonAnnotation(
[IS("$270")],
IonStruct(
IS("$409"),
container_id,
IS("$412"),
chunk_size,
IS("$410"),
compression_type,
IS("$411"),
drm_scheme,
IS("$587"),
kfxgen_application_version,
IS("$588"),
kfxgen_package_version,
IS("$161"),
container_format,
IS("version"),
version,
IS("$181"),
[[e.type_idnum, e.id_idnum] for e in self.entities],
),
)
self.container_id = container_id
def get_fragments(self):
if not self.fragments:
for data in [
self.doc_symbols,
self.container_info,
self.format_capabilities,
]:
if data is not None:
self.fragments.append(YJFragment(data))
for entity in self.entities:
self.fragments.append(entity.deserialize())
return self.fragments
def serialize(self):
container_id = None
kfxgen_package_version = ""
kfxgen_application_version = ""
doc_symbols = None
format_capabilities = None
container_cnt = (
format_capabilities_cnt
) = ion_symbol_table_cnt = container_entity_map_cnt = 0
for fragment in self.get_fragments():
if fragment.ftype == "$270":
container_cnt += 1
container_id = fragment.value.get("$409", "")
kfxgen_application_version = fragment.value.get("$587", "")
kfxgen_package_version = fragment.value.get("$588", "")
elif fragment.ftype == "$593":
format_capabilities_cnt += 1
format_capabilities = fragment
elif fragment.ftype == "$ion_symbol_table":
ion_symbol_table_cnt += 1
doc_symbols = fragment
doc_symbols = YJFragment(
doc_symbols.annotations, value=copy.deepcopy(doc_symbols.value)
)
for sym_import in doc_symbols.value["imports"]:
if "max_id" in sym_import:
sym_import["max_id"] += len(SYSTEM_SYMBOL_TABLE.symbols)
elif fragment.ftype == "$419":
container_entity_map_cnt += 1
if (
container_cnt != 1
or format_capabilities_cnt > 1
or ion_symbol_table_cnt != 1
or container_entity_map_cnt != 1
):
log.error(
"Missing/extra fragments required to build KFX container: "
"container=%d format_capabilities=%d ion_symbol_table=%d container_entity_map=%d"
% (
container_cnt,
format_capabilities_cnt,
ion_symbol_table_cnt,
container_entity_map_cnt,
)
)
entities = []
for fragment in self.fragments:
if (fragment.ftype not in CONTAINER_FRAGMENT_TYPES) or (
fragment.ftype == "$419"
):
entities.append(
KfxContainerEntity(
self.symtab,
id_idnum=self.symtab.get_id(
IS("$348") if fragment.is_single() else fragment.fid
),
type_idnum=self.symtab.get_id(fragment.ftype),
value=fragment.value,
)
)
container = Serializer()
container.pack("4s", KfxContainer.SIGNATURE)
container.pack("<H", KfxContainer.VERSION)
header_len_pack = container.pack("<L", 0)
container_info_offset_pack = container.pack("<L", 0)
container_info_length_pack = container.pack("<L", 0)
container_info = IonStruct()
container_info[IS("$409")] = container_id
container_info[IS("$410")] = DEFAULT_COMPRESSION_TYPE
container_info[IS("$411")] = DEFAULT_DRM_SCHEME
entity_data = Serializer()
entity_table = Serializer()
entity_offset = 0
for entity in entities:
serialized_entity = entity.serialize()
entity_data.append(serialized_entity)
entity_len = len(serialized_entity)
entity_table.pack("<L", entity.id_idnum)
entity_table.pack("<L", entity.type_idnum)
entity_table.pack("<Q", entity_offset)
entity_table.pack("<Q", entity_len)
entity_offset += entity_len
container_info[IS("$413")] = len(container)
container_info[IS("$414")] = len(entity_table)
container.append(entity_table.serialize())
if doc_symbols is not None:
doc_symbol_data = IonBinary(self.symtab).serialize_single_value(doc_symbols)
else:
doc_symbol_data = b""
container_info[IS("$415")] = len(container)
container_info[IS("$416")] = len(doc_symbol_data)
container.append(doc_symbol_data)
container_info[IS("$412")] = KfxContainer.DEFAULT_CHUNK_SIZE
if format_capabilities is not None:
format_capabilities_data = IonBinary(self.symtab).serialize_single_value(
format_capabilities
)
else:
format_capabilities_data = b""
if self.symtab.local_min_id > 595:
container_info[IS("$594")] = len(container)
container_info[IS("$595")] = len(format_capabilities_data)
container.append(format_capabilities_data)
container_info_data = IonBinary(self.symtab).serialize_single_value(
container_info
)
container.repack(container_info_length_pack, len(container_info_data))
container.repack(container_info_offset_pack, len(container))
container.append(container_info_data)
kfxgen_info = [
IonStruct("key", "kfxgen_package_version", "value", kfxgen_package_version),
IonStruct(
"key", "kfxgen_application_version", "value", kfxgen_application_version
),
IonStruct(
"key", "kfxgen_payload_sha1", "value", bytes_to_hex(entity_data.sha1())
),
IonStruct("key", "kfxgen_acr", "value", container_id),
]
container.append(
json_serialize_compact(kfxgen_info)
.replace(
'"key":',
"key:",
)
.replace('"value":', "value:")
.encode("ascii")
)
container.repack(header_len_pack, len(container))
container.extend(entity_data)
return container.serialize()
class KfxContainerEntity(object):
    """A single "ENTY" record within a KFX container: an id/type symbol
    number pair plus an Ion value (or a raw BLOB for media types)."""
    SIGNATURE = b"ENTY"
    VERSION = 1
    ALLOWED_VERSIONS = {1}
    MIN_LENGTH = 10
    def __init__(
        self, symtab, id_idnum=None, type_idnum=None, value=None, serialized_data=None
    ):
        # Either value (for serialization) or serialized_data (for later
        # deserialization) is supplied, not necessarily both.
        self.symtab = symtab
        self.id_idnum = id_idnum
        self.type_idnum = type_idnum
        self.value = value
        self.serialized_data = serialized_data
    def deserialize(self, data=None):
        """Parse the entity record (defaults to self.serialized_data) and
        return the resulting YJFragment.

        Raises Exception on a bad signature, short header, or unexpected
        entity-info keys; a bad version is only logged.
        """
        if data is None:
            data = self.serialized_data
        cont_entity = Deserializer(data)
        signature = cont_entity.unpack("4s")
        version = cont_entity.unpack("<H")
        header_len = cont_entity.unpack("<L")
        if signature != KfxContainerEntity.SIGNATURE:
            raise Exception(
                "Container entity signature is incorrect (%s)"
                % bytes_to_separated_hex(signature)
            )
        if version not in KfxContainerEntity.ALLOWED_VERSIONS:
            log.error("Container entity version is incorrect (%d)" % version)
        if header_len < KfxContainerEntity.MIN_LENGTH:
            raise Exception("Container entity header is too short (%d)" % header_len)
        self.header = data[:header_len]
        # The remainder of the header is a small Ion struct of entity info.
        entity_info = IonBinary(self.symtab).deserialize_single_value(
            cont_entity.extract(upto=header_len)
        )
        # Only uncompressed, non-DRM entities are supported.
        compression_type = entity_info.pop("$410", DEFAULT_COMPRESSION_TYPE)
        drm_scheme = entity_info.pop("$411", DEFAULT_DRM_SCHEME)
        if compression_type != DEFAULT_COMPRESSION_TYPE:
            log.error(
                "Container entity %s has unexpected bcComprType: %s"
                % (repr(self), repr(compression_type))
            )
        if drm_scheme != DEFAULT_DRM_SCHEME:
            log.error(
                "Container entity %s has unexpected bcDRMScheme: %s"
                % (repr(self), repr(drm_scheme))
            )
        if len(entity_info):
            raise Exception(
                "Container entity %s info has extra data: %s"
                % (repr(self), repr(entity_info))
            )
        entity_data = cont_entity.extract()
        fid = self.symtab.get_symbol(self.id_idnum)
        ftype = self.symtab.get_symbol(self.type_idnum)
        # Raw fragment types (media etc.) stay as opaque BLOBs; everything
        # else is decoded from binary Ion.
        if ftype in RAW_FRAGMENT_TYPES:
            self.value = IonBLOB(entity_data)
        else:
            self.value = IonBinary(self.symtab).deserialize_single_value(entity_data)
        if isinstance(self.value, IonAnnotation):
            if self.value.is_annotation(ftype) and fid == "$348":
                # A singleton annotated with its own type: unwrap it and
                # use the type as the fragment id.
                fid = ftype
                self.value = self.value.value
            else:
                log.error(
                    "Entity %s has IonAnnotation as value: %s"
                    % (repr(self), repr(self.value))
                )
        # $348 is the generic singleton id; the fragment then has no fid.
        return YJFragment(
            fid=fid if fid != "$348" else None, ftype=ftype, value=self.value
        )
    def serialize(self):
        """Return this entity encoded as an "ENTY" record (header + data)."""
        entity = Serializer()
        entity.pack("4s", KfxContainerEntity.SIGNATURE)
        entity.pack("<H", KfxContainerEntity.VERSION)
        # Placeholder; repacked with the real header length below.
        header_len_pack = entity.pack("<L", 0)
        entity_info = IonStruct()
        entity_info[IS("$410")] = DEFAULT_COMPRESSION_TYPE
        entity_info[IS("$411")] = DEFAULT_DRM_SCHEME
        entity.append(IonBinary(self.symtab).serialize_single_value(entity_info))
        entity.repack(header_len_pack, len(entity))
        ftype = self.symtab.get_symbol(self.type_idnum)
        if ftype in RAW_FRAGMENT_TYPES:
            if isinstance(self.value, IonBLOB):
                entity.append(bytes(self.value))
            else:
                raise Exception(
                    "KfxContainerEntity %s must be IonBLOB, found %s"
                    % (ftype, type_name(self.value))
                )
        else:
            entity.append(IonBinary(self.symtab).serialize_single_value(self.value))
        return entity.serialize()
    def __repr__(self):
        # Shown as "$type/$id" using the raw symbol id numbers.
        return "$%d/$%d" % (self.type_idnum, self.id_idnum)

View File

@@ -0,0 +1,693 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import decimal
import re
import uuid
from .ion import (
IS,
IonAnnotation,
IonFloat,
IonList,
IonSExp,
IonString,
IonStruct,
IonSymbol,
ion_type,
isstring,
unannotated,
)
from .message_logging import log
from .python_transition import IS_PYTHON2
from .utilities import font_file_ext
from .yj_container import YJFragment, YJFragmentKey
from .yj_structure import EID_REFERENCES, FORMAT_SYMBOLS, MAX_CONTENT_FRAGMENT_SIZE
from .yj_versions import GENERIC_CREATOR_VERSIONS, is_known_aux_metadata
if IS_PYTHON2:
from .python_transition import str
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
# Master switch: when False, fix_kpf_prepub_book returns without changes.
FIX_BOOK = True
# When False, content strings are left inline instead of being collected
# into $145 content fragments.
CREATE_CONTENT_FRAGMENTS = True
# Debug aid: cross-check the book's original position map against one
# recomputed from content.
VERIFY_ORIGINAL_POSITION_MAP = False
# Abbreviations recorded in kindle_audit_metadata for known creator tools.
SHORT_TOOL_NAME = {
    "Kindle Previewer 3": "KPR",
    "Kindle Create": "KC",
}
class KpfBook(object):
    """Mix-in holding the fix-ups that turn a KPF (Kindle Previewer /
    Kindle Create pre-publication) book into a valid KFX book."""
    def fix_kpf_prepub_book(self, fix_book, retain_yj_locals):
        """Apply all KPF-to-KFX fix-ups in place on self.fragments.

        fix_book: caller switch; combined with the module FIX_BOOK flag.
        retain_yj_locals: keep "yj.*" authoring keys instead of stripping
        them during fragment fix-up.
        """
        self.retain_yj_locals = retain_yj_locals
        if len(self.yj_containers) != 1:
            raise Exception("A KPF book should have only one container")
        self.kpf_container = self.yj_containers[0]
        if not (fix_book and FIX_BOOK):
            return
        # Re-key external resources ($417) whose id lacks the "resource/"
        # prefix.
        for fragment in self.fragments.get_all("$417"):
            orig_fid = fragment.fid
            fixed_fid = fix_resource_location(orig_fid)
            if fixed_fid != orig_fid:
                self.fragments.remove(fragment)
                self.fragments.append(
                    YJFragment(
                        ftype="$417",
                        fid=self.create_local_symbol(fixed_fid),
                        value=fragment.value,
                    )
                )
        # Normalize the Ion data of every fragment except container info.
        for fragment in list(self.fragments):
            if fragment.ftype != "$270":
                self.kpf_fix_fragment(fragment)
        # Drop zero-valued $143 entries from $266 fragments.
        for fragment in self.fragments.get_all("$266"):
            if fragment.value.get("$183", {}).get("$143", None) == 0:
                fragment.value["$183"].pop("$143")
        fragment = self.fragments.get("$550")
        if fragment is not None:
            for lm in fragment.value:
                lm.pop("$178", None)
        # Book metadata ($490): ensure both metadata categories exist,
        # then fill in/repair individual entries.
        fragment = self.fragments.get("$490")
        if fragment is not None:
            for category in ["kindle_audit_metadata", "kindle_title_metadata"]:
                for cm in fragment.value["$491"]:
                    if cm["$495"] == category:
                        break
                else:
                    fragment.value["$491"].append(
                        IonStruct(IS("$495"), category, IS("$258"), [])
                    )
            for cm in fragment.value["$491"]:
                if cm["$495"] == "kindle_audit_metadata":
                    # Replace generic creator/version pairs with the real
                    # tool identity recorded in the KCB metadata.
                    if (
                        (
                            self.get_metadata_value(
                                "file_creator", category="kindle_audit_metadata"
                            ),
                            self.get_metadata_value(
                                "creator_version", category="kindle_audit_metadata"
                            ),
                        )
                        in GENERIC_CREATOR_VERSIONS
                    ) and self.kpf_container.kcb_data:
                        kcb_metadata = self.kpf_container.kcb_data.get("metadata", {})
                        tool_name = kcb_metadata.get("tool_name")
                        tool_version = kcb_metadata.get("tool_version")
                        if (
                            tool_name
                            and tool_version
                            and not tool_version.startswith("unknown")
                        ):
                            for metadata in cm["$258"]:
                                if metadata["$492"] == "file_creator":
                                    metadata["$307"] = SHORT_TOOL_NAME.get(
                                        tool_name, tool_name
                                    )
                                if metadata["$492"] == "creator_version":
                                    metadata["$307"] = tool_version
                elif cm["$495"] == "kindle_title_metadata":
                    # Fill in required title metadata that KPF omits.
                    if self.get_metadata_value("asset_id") is None:
                        cm["$258"].append(
                            IonStruct(
                                IS("$492"),
                                "asset_id",
                                IS("$307"),
                                self.create_container_id(),
                            )
                        )
                    if self.get_metadata_value("is_sample") is None:
                        cm["$258"].append(
                            IonStruct(IS("$492"), "is_sample", IS("$307"), False)
                        )
                    # "ja-zh..." language codes: strip the prefix and map
                    # "=" separators to "-".
                    if (
                        self.get_metadata_value("language", default="")
                        .lower()
                        .startswith("ja-zh")
                    ):
                        for metadata in cm["$258"]:
                            if metadata["$492"] == "language":
                                metadata["$307"] = metadata["$307"][3:].replace(
                                    "=", "-"
                                )
                    # Multiple authors: replace the single merged entry
                    # with one entry per author from the source EPUB.
                    if (
                        self.kpf_container.source_epub is not None
                        and len(self.kpf_container.source_epub.authors) > 1
                    ):
                        for i, md in reversed(list(enumerate(cm["$258"]))):
                            if md["$492"] == "author":
                                cm["$258"].pop(i)
                        for author in self.kpf_container.source_epub.authors:
                            cm["$258"].append(
                                IonStruct(IS("$492"), "author", IS("$307"), author)
                            )
                    # NOTE(review): source_epub may be None here (the guard
                    # above is a separate if); verify this cannot raise
                    # AttributeError for non-EPUB-sourced KPFs.
                    if (
                        self.kpf_container.source_epub.issue_date
                        and self.get_metadata_value("issue_date") is None
                    ):
                        cm["$258"].append(
                            IonStruct(
                                IS("$492"),
                                "issue_date",
                                IS("$307"),
                                self.kpf_container.source_epub.issue_date,
                            )
                        )
                    if self.get_metadata_value("override_kindle_font") is None:
                        cm["$258"].append(
                            IonStruct(
                                IS("$492"), "override_kindle_font", IS("$307"), False
                            )
                        )
                    # Fixed-layout books without a cover: try to locate one
                    # from the content.
                    if (
                        self.get_metadata_value("cover_image") is None
                        and self.get_metadata_value(
                            "yj_fixed_layout", category="kindle_capability_metadata"
                        )
                        is not None
                    ):
                        cover_resource = self.locate_cover_image_resource_from_content()
                        if cover_resource is not None:
                            cm["$258"].append(
                                IonStruct(
                                    IS("$492"),
                                    "cover_image",
                                    IS("$307"),
                                    str(cover_resource),
                                )
                            )
        # Fonts ($262): normalize the fragment id and move the raw font
        # data from $417 (external resource) to $418 (font data).
        for fragment in self.fragments.get_all("$262"):
            if fragment.fid != "$262":
                self.fragments.remove(fragment)
                self.fragments.append(YJFragment(ftype="$262", value=fragment.value))
            location = fragment.value["$165"]
            font_data_fragment = self.fragments[
                YJFragmentKey(ftype="$417", fid=location)
            ]
            self.fragments.remove(font_data_fragment)
            self.fragments.append(
                YJFragment(
                    ftype="$418",
                    fid=self.create_local_symbol(location),
                    value=font_data_fragment.value,
                )
            )
        # External resources ($164): copy missing $422/$423 dimensions
        # from a referred resource; drop empty mime types.
        for fragment in self.fragments.get_all("$164"):
            fv = fragment.value
            if (
                fv.get("$161") == "$287"
                and "$422" not in fv
                and "$423" not in fv
                and "$167" in fv
            ):
                referred_resources = fv["$167"]
                for frag in self.fragments.get_all("$164"):
                    if (
                        frag.fid in referred_resources
                        and "$422" in frag.value
                        and "$423" in frag.value
                    ):
                        fv[IS("$422")] = frag.value["$422"]
                        fv[IS("$423")] = frag.value["$423"]
                        break
            if fv.get("$162") == "":
                fv.pop("$162")
                log.warning(
                    "Removed empty mime type from external_resource %s" % fv.get("$175")
                )
        cover_image_data = self.get_cover_image_data()
        if cover_image_data is not None:
            new_cover_image_data = self.fix_cover_image_data(cover_image_data)
            if new_cover_image_data != cover_image_data:
                self.set_cover_image_data(new_cover_image_data)
        # Determine the CanonicalFormat version to advertise.
        canonical_format = (2, 0) if self.is_illustrated_layout else (1, 0)
        file_creator = self.get_metadata_value(
            "file_creator", category="kindle_audit_metadata", default=""
        )
        creator_version = self.get_metadata_value(
            "creator_version", category="kindle_audit_metadata", default=""
        )
        if (
            file_creator == "KC"
            or (file_creator == "KTC" and creator_version >= "1.11")
        ) and canonical_format < (2, 0):
            canonical_format = (2, 0)
        # Content features ($585): clean or create the feature list.
        content_features = self.fragments.get("$585")
        if content_features is not None:
            content_features.value.pop("$155", None)
            content_features.value.pop("$598", None)
        else:
            content_features = YJFragment(ftype="$585", value=IonStruct(IS("$590"), []))
            self.fragments.append(content_features)
        features = content_features.value["$590"]
        def add_feature(feature, namespace="com.amazon.yjconversion", version=(1, 0)):
            # Append a feature entry unless one is already present.
            if self.get_feature_value(feature, namespace=namespace) is None:
                features.append(
                    IonStruct(
                        IS("$586"),
                        namespace,
                        IS("$492"),
                        feature,
                        IS("$589"),
                        IonStruct(
                            IS("version"),
                            IonStruct(IS("$587"), version[0], IS("$588"), version[1]),
                        ),
                    )
                )
        def add_feature_from_metadata(
            metadata,
            feature,
            category="kindle_capability_metadata",
            namespace="com.amazon.yjconversion",
            version=(1, 0),
        ):
            # Add a feature only when the corresponding metadata key exists.
            if self.get_metadata_value(metadata, category=category) is not None:
                add_feature(feature, namespace, version)
        add_feature("CanonicalFormat", namespace="SDK.Marker", version=canonical_format)
        if self.is_fixed_layout:
            if self.has_pdf_resource:
                add_feature("yj_pdf_support")
                add_feature_from_metadata("yj_fixed_layout", "yj_fixed_layout")
            else:
                add_feature_from_metadata(
                    "yj_fixed_layout", "yj_non_pdf_fixed_layout", version=2
                )
        # Scan resources for HD images, tiled images, and JPEG restart
        # markers (0xFFD0-0xFFD7), which require feature flags.
        has_hdv_image = has_tiles = yj_jpg_rst_marker_present = False
        for fragment in self.fragments.get_all("$164"):
            fv = fragment.value
            if fv.get("$422", 0) > 1920 or fv.get("$423", 0) > 1920 or "$636" in fv:
                has_hdv_image = True
            if IS("$797") in fv:
                has_tiles = True
            if (not yj_jpg_rst_marker_present) and fv.get("$161") == "$285":
                location = fv.get("$165", None)
                if location is not None:
                    raw_media = self.fragments.get(
                        ftype="$417", fid=location, first=True
                    )
                    if raw_media is not None:
                        if re.search(b"\xff[\xd0-\xd7]", raw_media.value.tobytes()):
                            yj_jpg_rst_marker_present = True
        if not self.is_fixed_layout:
            if has_tiles:
                # NOTE(review): (2, 0) binds to the *namespace* parameter of
                # add_feature, not version — looks unintended; confirm
                # whether version=(2, 0) was meant.
                add_feature("yj_hdv", (2, 0))
            elif has_hdv_image:
                add_feature("yj_hdv")
        if yj_jpg_rst_marker_present:
            add_feature("yj_jpg_rst_marker_present")
        add_feature_from_metadata("graphical_highlights", "yj_graphical_highlights")
        add_feature_from_metadata("yj_textbook", "yj_textbook")
        # Book navigation ($389): create an empty one per reading order if
        # absent.
        if self.fragments.get("$389") is None:
            log.info("Adding book_navigation")
            book_navigation = []
            for reading_order_name in self.reading_order_names():
                book_nav = IonStruct()
                if reading_order_name:
                    book_nav[IS("$178")] = reading_order_name
                book_nav[IS("$392")] = []
                book_navigation.append(book_nav)
            self.fragments.append(YJFragment(ftype="$389", value=book_navigation))
        # Convert "page_list_entry:" landmark labels into a real page list
        # nav container when the book lacks one.
        for book_navigation in self.fragments["$389"].value:
            pages = []
            nav_containers = book_navigation["$392"]
            has_page_list = False
            for nav_container in nav_containers:
                nav_container = unannotated(nav_container)
                nav_type = nav_container.get("$235", None)
                if nav_type == "$236":
                    entries = nav_container.get("$247", [])
                    i = 0
                    while i < len(entries):
                        entry = unannotated(entries[i])
                        label = entry.get("$241", {}).get("$244", "")
                        if label.startswith("page_list_entry:"):
                            # Label format: "page_list_entry:<seq>:<text>"
                            seq, sep, text = label.partition(":")[2].partition(":")
                            pages.append(
                                (
                                    int(seq),
                                    IonAnnotation(
                                        [IS("$393")],
                                        IonStruct(
                                            IS("$241"),
                                            IonStruct(IS("$244"), text),
                                            IS("$246"),
                                            entry["$246"],
                                        ),
                                    ),
                                )
                            )
                            # Remove the consumed entry; compensate the
                            # index for the shift.
                            entries.pop(i)
                            i -= 1
                        i += 1
                elif nav_type == "$237":
                    log.info("KPF book contains a page list")
                    has_page_list = True
            if pages and not has_page_list:
                log.info(
                    "Transformed %d KFX landmark entries into a page list" % len(pages)
                )
                nav_containers.append(
                    IonAnnotation(
                        [IS("$391")],
                        IonStruct(
                            IS("$235"),
                            IS("$237"),
                            IS("$239"),
                            self.kpf_gen_uuid_symbol(),
                            IS("$247"),
                            [p[1] for p in sorted(pages)],
                        ),
                    )
                )
        if self.is_dictionary:
            self.is_kpf_prepub = False
        else:
            # Non-dictionary books: gather content strings into $145
            # fragments and (re)build position/location maps.
            has_text_block = False
            if CREATE_CONTENT_FRAGMENTS:
                content_fragment_data = {}
                for section_name in self.ordered_section_names():
                    for story_name in self.extract_section_story_names(section_name):
                        self.kpf_collect_content_strings(
                            story_name, content_fragment_data
                        )
                for content_name, content_list in content_fragment_data.items():
                    has_text_block = True
                    self.fragments.append(
                        YJFragment(
                            ftype="$145",
                            fid=content_name,
                            value=IonStruct(
                                IS("name"), content_name, IS("$146"), content_list
                            ),
                        )
                    )
            else:
                log.warning("Content fragment creation is disabled")
            map_pos_info = self.collect_position_map_info()
            if VERIFY_ORIGINAL_POSITION_MAP:
                content_pos_info = self.collect_content_position_info()
                self.verify_position_info(content_pos_info, map_pos_info)
            if len(map_pos_info) < 10 and self.is_illustrated_layout:
                log.warning("creating position map (original is missing or incorrect)")
                map_pos_info = self.collect_content_position_info()
            self.is_kpf_prepub = False
            has_spim, has_position_id_offset = self.create_position_map(map_pos_info)
            has_yj_location_pid_map = False
            if self.fragments.get("$550") is None and not (
                self.is_print_replica or self.is_magazine
            ):
                loc_info = self.generate_approximate_locations(map_pos_info)
                has_yj_location_pid_map = self.create_location_map(loc_info)
            if self.fragments.get("$395") is None:
                self.fragments.append(
                    YJFragment(ftype="$395", value=IonStruct(IS("$247"), []))
                )
        # Rebuild format capabilities ($593) from scratch.
        for fragment in self.fragments.get_all("$593"):
            self.fragments.remove(fragment)
        fc = []
        # NOTE(review): has_spim/has_position_id_offset/has_text_block are
        # only bound in the non-dictionary branch above; confirm this path
        # cannot raise NameError for dictionary books.
        if has_spim or has_yj_location_pid_map:
            fc.append(
                IonStruct(IS("$492"), "kfxgen.positionMaps", IS("version"), 2)
            )
        if has_position_id_offset:
            fc.append(
                IonStruct(IS("$492"), "kfxgen.pidMapWithOffset", IS("version"), 1)
            )
        if has_text_block:
            fc.append(IonStruct(IS("$492"), "kfxgen.textBlock", IS("version"), 1))
        self.fragments.append(YJFragment(ftype="$593", value=fc))
        # Report any auxiliary metadata keys this code does not recognize.
        for fragment in self.fragments.get_all("$597"):
            for kv in fragment.value.get("$258", []):
                key = kv.get("$492", "")
                value = kv.get("$307", "")
                if not is_known_aux_metadata(key, value):
                    log.warning("Unknown auxiliary_data: %s=%s" % (key, value))
        self.check_fragment_usage(rebuild=True, ignore_extra=True)
        self.check_symbol_table(rebuild=True, ignore_unused=True)
    def kpf_gen_uuid_symbol(self):
        """Create and return a new local symbol named with a random UUID."""
        return self.create_local_symbol(str(uuid.uuid4()))
    def kpf_fix_fragment(self, fragment):
        """Recursively normalize one fragment's Ion data in place:
        unwrap $608 annotations, convert eid references and format names
        to symbol ids, strip yj.* authoring keys (unless retained), fix
        resource locations and style names, and round tiny floats to 0."""
        def _fix_ion_data(data, container):
            # container is the key/list context of *data*, used to decide
            # which conversions apply at this level.
            data_type = ion_type(data)
            if data_type is IonAnnotation:
                if data.is_annotation("$608"):
                    return _fix_ion_data(data.value, container)
                new_annot = [_fix_ion_data(annot, None) for annot in data.annotations]
                return IonAnnotation(new_annot, _fix_ion_data(data.value, container))
            if data_type is IonList:
                new_list = []
                for i, fc in enumerate(data):
                    # In content lists ($146), inline referenced $608
                    # structure fragments.
                    if container == "$146" and isinstance(fc, IonSymbol):
                        structure = self.fragments.get(
                            YJFragmentKey(ftype="$608", fid=fc)
                        )
                        if structure is not None:
                            fc = copy.deepcopy(structure.value)
                    if (not self.is_dictionary) and (
                        (
                            fragment.ftype == "$609"
                            and container == "contains_list_"
                            and i == 1
                        )
                        or (
                            fragment.ftype == "$538"
                            and container == "yj.semantics.containers_with_semantics"
                        )
                    ):
                        fc = self.symbol_id(fc)
                    if container == "$181":
                        list_container = "contains_list_"
                    elif container == "$141":
                        list_container = "$141"
                    else:
                        list_container = None
                    new_list.append(_fix_ion_data(fc, list_container))
                return new_list
            if data_type is IonSExp:
                new_sexp = IonSExp()
                for fc in data:
                    new_sexp.append(_fix_ion_data(fc, None))
                return new_sexp
            if data_type is IonStruct:
                new_struct = IonStruct()
                for fk, fv in data.items():
                    fv = _fix_ion_data(fv, fk)
                    if not self.is_dictionary:
                        if fk == "$597":
                            continue
                        if fk == "$239":
                            self.create_local_symbol(str(fv))
                    # Convert eid reference symbols to numeric ids ($598
                    # keys become $155).
                    if (
                        fk in EID_REFERENCES
                        and fragment.ftype != "$597"
                        and isinstance(fv, IonSymbol)
                    ):
                        if fk == "$598":
                            fk = IS("$155")
                        if (
                            fragment.ftype != "$610"
                            or self.fragments.get(ftype="$260", fid=fv) is None
                        ):
                            fv = self.symbol_id(fv)
                    # Format names given as strings become format symbols.
                    if fk == "$161" and isstring(fv):
                        fv = IS(FORMAT_SYMBOLS[fv])
                    # Strip authoring-tool-local keys unless retained.
                    if (not self.retain_yj_locals) and (
                        fk.startswith("yj.authoring.")
                        or fk.startswith("yj.conversion.")
                        or fk.startswith("yj.print.")
                        or fk.startswith("yj.semantics.")
                        or fk == "$790"
                    ):
                        continue
                    if (
                        self.is_illustrated_layout
                        and fragment.ftype == "$260"
                        and container == "$141"
                        and fk in ["$67", "$66"]
                    ):
                        continue
                    if fk == "$165":
                        if ion_type(fv) is not IonString:
                            raise Exception("location is not IonString: %s" % fv)
                        fv = fix_resource_location(fv)
                    # A style ($157) must be named after its fragment id.
                    if fragment.ftype == "$157" and fk == "$173" and fv != fragment.fid:
                        log.info(
                            "Fixing incorrect name %s of style %s" % (fv, fragment.fid)
                        )
                        fv = fragment.fid
                    new_struct[_fix_ion_data(fk, None)] = fv
                return new_struct
            if data_type is IonFloat:
                # Floats become decimals; near-zero values collapse to 0.
                dec = decimal.Decimal("%g" % data)
                if abs(dec) < 0.001:
                    dec = decimal.Decimal("0")
                return dec
            return data
        fragment.value = _fix_ion_data(fragment.value, None)
    def kpf_collect_content_strings(self, story_name, content_fragment_data):
        """Walk the $259 story fragment named *story_name*, moving every
        inline $145 content string into content_fragment_data (a dict of
        content-fragment-name -> list of strings), replacing each string
        with a {name, $403 index} reference.  New content fragments are
        started whenever the current one reaches MAX_CONTENT_FRAGMENT_SIZE.
        """
        def _kpf_collect_content_strings(data):
            data_type = ion_type(data)
            if data_type is IonAnnotation:
                _kpf_collect_content_strings(data.value)
            elif data_type is IonList or data_type is IonSExp:
                for fc in data:
                    _kpf_collect_content_strings(fc)
            elif data_type is IonStruct:
                for fk, fv in data.items():
                    if fk == "$145" and isstring(fv):
                        # Start a new content fragment on first use or when
                        # the current one is full.
                        if (
                            len(content_fragment_data) == 0
                            or self._content_fragment_size >= MAX_CONTENT_FRAGMENT_SIZE
                        ):
                            self._content_fragment_name = self.create_local_symbol(
                                "content_%d" % (len(content_fragment_data) + 1)
                            )
                            content_fragment_data[self._content_fragment_name] = []
                            self._content_fragment_size = 0
                        content_fragment_data[self._content_fragment_name].append(fv)
                        self._content_fragment_size += len(fv.encode("utf8"))
                        # Replace the inline string with a reference into
                        # the content fragment.
                        data[fk] = IonStruct(
                            IS("name"),
                            self._content_fragment_name,
                            IS("$403"),
                            len(content_fragment_data[self._content_fragment_name]) - 1,
                        )
                    else:
                        _kpf_collect_content_strings(fv)
        _kpf_collect_content_strings(
            self.fragments[YJFragmentKey(ftype="$259", fid=story_name)].value
        )
    def symbol_id(self, symbol):
        """Return the numeric symbol id for *symbol*; ints and None pass
        through unchanged."""
        if symbol is None or isinstance(symbol, int):
            return symbol
        return self.symtab.get_id(symbol)
    def kpf_add_font_ext(self, filename, raw_font):
        """Return *filename* with the extension detected from the font's
        raw bytes appended (unchanged apart from the suffix)."""
        ext = font_file_ext(raw_font)
        if not ext:
            # NOTE(review): log.warn is the deprecated alias of
            # log.warning used elsewhere in this code.
            log.warn("font %s has unknown type (possibly obfuscated)" % filename)
        return "%s%s" % (filename, ext)
def section_sort_key(reading_order, s):
    """Sort key for section names: sections present in *reading_order*
    sort by their position there, unknown sections sort after them all;
    the name itself breaks ties."""
    if s in reading_order:
        return (reading_order.index(s), s)
    return (len(reading_order), s)
def fix_resource_location(s):
    """Ensure a resource location string carries the "resource/" prefix."""
    if s.startswith("resource/"):
        return s
    return "resource/%s" % s

View File

@@ -0,0 +1,572 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import io
import os
try:
import apsw
have_apsw = True
except ImportError:
import sqlite3
have_apsw = False
from .ion import (
IS,
IonAnnotation,
IonBLOB,
IonInt,
IonList,
IonSExp,
IonString,
IonStruct,
ion_type,
)
from .ion_binary import IonBinary
from .message_logging import log
from .original_source_epub import SourceEpub
from .python_transition import IS_PYTHON2
from .utilities import (
ZIP_SIGNATURE,
DataFile,
Deserializer,
KFXDRMError,
bytes_to_separated_hex,
json_deserialize,
json_serialize,
natural_sort_key,
temp_filename,
)
from .yj_container import (
CONTAINER_FORMAT_KPF,
DRMION_SIGNATURE,
ROOT_FRAGMENT_TYPES,
YJContainer,
YJFragment,
)
from .yj_symbol_catalog import SYSTEM_SYMBOL_TABLE
if IS_PYTHON2:
from .python_transition import repr
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
DEBUG = False
RETAIN_KFX_ID_ANNOT = False
RESOURCE_DIRECTORY = "resources"
DICTIONARY_RULES_FILENAME = "DictionaryRules.ion"
# Magic bytes at the start of every SQLite database file; a KDF book
# database is SQLite, so this doubles as the KDF signature below.
SQLITE_SIGNATURE = b"SQLite format 3\0"
class KpfContainer(YJContainer):
KPF_SIGNATURE = ZIP_SIGNATURE
KDF_SIGNATURE = SQLITE_SIGNATURE
db_timeout = 30
def __init__(self, symtab, datafile=None, fragments=None, book=None):
YJContainer.__init__(self, symtab, datafile=datafile, fragments=fragments)
self.book = book
    def deserialize(self, ignore_drm=False):
        """Decode the KPF/KDF into self.fragments.

        Locates the KDF SQLite database (either inside a KPF zip or given
        directly), strips any SQLite fingerprint records, then reads the
        known tables (index_info, kfxid_translation, fragment_properties,
        fragments, gc_*, capabilities), converting each payload into a
        YJFragment. Any unrecognised table schema is reported as an error.
        """
        self.ignore_drm = ignore_drm
        self.fragments.clear()
        self.kpf_datafile = (
            self.kdf_datafile
        ) = self.kcb_datafile = self.kcb_data = self.source_epub = None
        # Locate the .kdf (and optional .kcb) inside a KPF zip, or use the
        # datafile directly as a bare KDF.
        if self.datafile.is_zipfile():
            self.kpf_datafile = self.datafile
            with self.kpf_datafile.as_ZipFile() as zf:
                for info in zf.infolist():
                    ext = os.path.splitext(info.filename)[1]
                    if ext == ".kdf":
                        self.kdf_datafile = DataFile(
                            info.filename, zf.read(info), self.kpf_datafile
                        )
                    elif ext == ".kdf-journal":
                        # A non-empty journal means the database was not
                        # cleanly closed by the producer.
                        if len(zf.read(info)) > 0:
                            raise Exception(
                                "kdf-journal is not empty in %s"
                                % self.kpf_datafile.name
                            )
                    elif ext == ".kcb":
                        self.kcb_datafile = DataFile(
                            info.filename, zf.read(info), self.kpf_datafile
                        )
                        self.kcb_data = json_deserialize(self.kcb_datafile.get_data())
            if self.kdf_datafile is None:
                raise Exception("Failed to locate KDF within %s" % self.datafile.name)
        else:
            self.kdf_datafile = self.datafile
        # Remove Amazon's fingerprint records so SQLite can open the file.
        unwrapped_kdf_datafile = SQLiteFingerprintWrapper(self.kdf_datafile).remove()
        db_filename = (
            unwrapped_kdf_datafile.name
            if unwrapped_kdf_datafile.is_real_file and not self.book.is_netfs
            else temp_filename("kdf", unwrapped_kdf_datafile.get_data())
        )
        # Open via apsw when available, falling back to the stdlib sqlite3.
        # WITHOUT ROWID tables require SQLite >= 3.8.2.
        if have_apsw:
            if natural_sort_key(apsw.sqlitelibversion()) < natural_sort_key("3.8.2"):
                raise Exception(
                    "SQLite version 3.8.2 or later is necessary in order to use a WITHOUT ROWID table. Found version %s"
                    % apsw.sqlitelibversion()
                )
            conn = apsw.Connection(db_filename)
        else:
            if sqlite3.sqlite_version_info < (3, 8, 2):
                raise Exception(
                    "SQLite version 3.8.2 or later is necessary in order to use a WITHOUT ROWID table. Found version %s"
                    % sqlite3.sqlite_version
                )
            conn = sqlite3.connect(db_filename, KpfContainer.db_timeout)
        cursor = conn.cursor()
        # Collect every table's CREATE statement; each recognised schema is
        # removed from this set, so leftovers indicate unknown tables.
        sql_list = cursor.execute(
            "SELECT sql FROM sqlite_master WHERE type='table';"
        ).fetchall()
        schema = set([x[0] for x in sql_list])
        dictionary_index_terms = set()
        first_head_word = ""
        # -- index_info: present only for dictionaries --
        INDEX_INFO_SCHEMA = (
            "CREATE TABLE index_info(namespace char(256), index_name char(256), property char(40), "
            "primary key (namespace, index_name)) without rowid"
        )
        if INDEX_INFO_SCHEMA in schema:
            schema.remove(INDEX_INFO_SCHEMA)
            self.book.is_dictionary = True
            for namespace, index_name, property in cursor.execute(
                "SELECT * FROM index_info;"
            ):
                if namespace != "dictionary" or property != "yj.dictionary.term":
                    log.error(
                        "unexpected index_info: namespace=%s, index_name=%s, property=%s"
                        % (namespace, index_name, property)
                    )
                table_name = "index_%s_%s" % (namespace, index_name)
                index_schema = (
                    "CREATE TABLE %s ([%s] char(256), id char(40), "
                    "primary key ([%s], id)) without rowid"
                ) % (table_name, property, property)
                if index_schema in schema:
                    schema.remove(index_schema)
                    num_entries = 0
                    index_words = set()
                    index_kfx_ids = set()
                    for dictionary_term, kfx_id in cursor.execute(
                        "SELECT * FROM %s;" % table_name
                    ):
                        num_entries += 1
                        dictionary_index_terms.add((dictionary_term, IS(kfx_id)))
                        index_words.add(dictionary_term)
                        index_kfx_ids.add(kfx_id)
                        # Track the lexicographically first head word.
                        if dictionary_term < first_head_word or not first_head_word:
                            first_head_word = dictionary_term
                    log.info(
                        "Dictionary %s table has %d entries with %d terms and %d definitions"
                        % (
                            table_name,
                            num_entries,
                            len(index_words),
                            len(index_kfx_ids),
                        )
                    )
                else:
                    log.error("KPF database is missing the '%s' table" % table_name)
        # -- kfxid_translation: maps numeric eids to kfx_id symbols --
        self.eid_symbol = {}
        KFXID_TRANSLATION_SCHEMA = "CREATE TABLE kfxid_translation(eid INTEGER, kfxid char(40), primary key(eid)) without rowid"
        if KFXID_TRANSLATION_SCHEMA in schema:
            schema.remove(KFXID_TRANSLATION_SCHEMA)
            for eid, kfx_id in cursor.execute("SELECT * FROM kfxid_translation;"):
                self.eid_symbol[eid] = self.create_local_symbol(kfx_id)
        # -- fragment_properties: per-fragment key/value metadata --
        self.element_type = {}
        FRAGMENT_PROPERTIES_SCHEMA = (
            "CREATE TABLE fragment_properties(id char(40), key char(40), value char(40), "
            "primary key (id, key, value)) without rowid"
        )
        if FRAGMENT_PROPERTIES_SCHEMA in schema:
            schema.remove(FRAGMENT_PROPERTIES_SCHEMA)
            for id, key, value in cursor.execute("SELECT * FROM fragment_properties;"):
                if key == "child":
                    pass
                elif key == "element_type":
                    self.element_type[id] = value
                else:
                    log.error(
                        "fragment_property has unknown key: id=%s key=%s value=%s"
                        % (id, key, value)
                    )
        # -- fragments: the book content itself --
        self.max_eid_in_sections = None
        FRAGMENTS_SCHEMA = "CREATE TABLE fragments(id char(40), payload_type char(10), payload_value blob, primary key (id))"
        if FRAGMENTS_SCHEMA in schema:
            schema.remove(FRAGMENTS_SCHEMA)
            # The symbol table must be established before any other fragment
            # is decoded: prefer an explicit $ion_symbol_table, else fall
            # back to importing YJ_symbols up to max_id.
            for id in ["$ion_symbol_table", "max_id"]:
                rows = cursor.execute(
                    "SELECT payload_value FROM fragments WHERE id = ? AND payload_type = 'blob';",
                    (id,),
                ).fetchall()
                if rows:
                    payload_data = self.prep_payload_blob(rows[0][0])
                    if payload_data is None:
                        pass
                    elif id == "$ion_symbol_table":
                        self.symtab.creating_yj_local_symbols = True
                        sym_import = IonBinary(self.symtab).deserialize_annotated_value(
                            payload_data,
                            expect_annotation="$ion_symbol_table",
                            import_symbols=True,
                        )
                        self.symtab.creating_yj_local_symbols = False
                        if DEBUG:
                            log.info(
                                "kdf symbol import = %s" % json_serialize(sym_import)
                            )
                        self.fragments.append(YJFragment(sym_import))
                        break
                    else:
                        max_id = IonBinary(self.symtab).deserialize_single_value(
                            payload_data
                        )
                        if DEBUG:
                            log.info("kdf max_id = %d" % max_id)
                        self.symtab.clear()
                        self.symtab.import_shared_symbol_table(
                            "YJ_symbols",
                            max_id=max_id - len(SYSTEM_SYMBOL_TABLE.symbols),
                        )
                        self.fragments.append(YJFragment(self.symtab.create_import()))
            # Now decode every fragment row.
            for id, payload_type, payload_value in cursor.execute(
                "SELECT * FROM fragments;"
            ):
                ftype = id
                if payload_type == "blob":
                    payload_data = self.prep_payload_blob(payload_value)
                    if id in ["max_id", "$ion_symbol_table"]:
                        pass
                    elif payload_data is None:
                        # DRM-protected payload that was skipped.
                        ftype = self.element_type.get(id)
                    elif id == "max_eid_in_sections":
                        ftype = None
                        self.max_eid_in_sections = IonBinary(
                            self.symtab
                        ).deserialize_single_value(payload_data)
                        if self.book.is_dictionary:
                            pass
                        else:
                            log.warning(
                                "Unexpected max_eid_in_sections for non-dictionary: %d"
                                % self.max_eid_in_sections
                            )
                    elif not payload_data.startswith(IonBinary.SIGNATURE):
                        # Raw (non-Ion) data becomes a $417 raw-media fragment.
                        ftype = None
                        self.fragments.append(
                            YJFragment(
                                ftype="$417",
                                fid=self.create_local_symbol(id),
                                value=IonBLOB(payload_data),
                            )
                        )
                    elif len(payload_data) == len(IonBinary.SIGNATURE):
                        # Signature only, no content.
                        if id != "book_navigation":
                            log.warning("Ignoring empty %s fragment" % id)
                    else:
                        value = IonBinary(self.symtab).deserialize_annotated_value(
                            payload_data
                        )
                        if not isinstance(value, IonAnnotation):
                            log.error(
                                "KDF fragment id=%s is missing annotation: %s"
                                % (id, repr(value))
                            )
                            continue
                        elif (
                            len(value.annotations) == 2
                            and value.annotations[1] == "$608"
                        ):
                            pass
                        elif len(value.annotations) > 1:
                            log.error(
                                "KDF fragment should have one annotation: %s"
                                % repr(value)
                            )
                        ftype = value.annotations[0]
                        if (
                            ftype in ROOT_FRAGMENT_TYPES
                        ):  # shortcut when symbol table unavailable
                            fid = None
                        else:
                            fid = self.create_local_symbol(id)
                        self.fragments.append(
                            YJFragment(
                                ftype=ftype,
                                fid=fid,
                                value=self.deref_kfx_ids(value.value),
                            )
                        )
                elif payload_type == "path":
                    # Payload lives in an external resource file referenced
                    # by path rather than inline in the database.
                    ftype = "$417"
                    resource_data = self.get_resource_data(
                        self.prep_payload_blob(payload_value).decode("utf8")
                    )
                    if resource_data is not None:
                        self.fragments.append(
                            YJFragment(
                                ftype=ftype,
                                fid=self.create_local_symbol(id),
                                value=IonBLOB(resource_data),
                            )
                        )
                else:
                    log.error(
                        "Unexpected KDF payload_type=%s, id=%s, value=%d bytes"
                        % (payload_type, id, len(payload_value))
                    )
        else:
            log.error("KPF database is missing the 'fragments' table")
        # -- garbage-collection bookkeeping tables: recognised but unused --
        GC_FRAGMENT_PROPERTIES_SCHEMA = (
            "CREATE TABLE gc_fragment_properties(id varchar(40), key varchar(40), "
            "value varchar(40), primary key (id, key, value)) without rowid"
        )
        if GC_FRAGMENT_PROPERTIES_SCHEMA in schema:
            schema.remove(GC_FRAGMENT_PROPERTIES_SCHEMA)
        GC_REACHABLE_SCHEMA = (
            "CREATE TABLE gc_reachable(id varchar(40), primary key (id)) without rowid"
        )
        if GC_REACHABLE_SCHEMA in schema:
            schema.remove(GC_REACHABLE_SCHEMA)
        # -- capabilities: converted into a $593 format-capabilities fragment --
        CAPABILITIES_SCHEMA = "CREATE TABLE capabilities(key char(20), version smallint, primary key (key, version)) without rowid"
        if CAPABILITIES_SCHEMA in schema:
            schema.remove(CAPABILITIES_SCHEMA)
            capabilities = cursor.execute("SELECT * FROM capabilities;").fetchall()
            if capabilities:
                format_capabilities = [
                    IonStruct(IS("$492"), key, IS("version"), version)
                    for key, version in capabilities
                ]
                self.fragments.append(
                    YJFragment(ftype="$593", value=format_capabilities)
                )
        else:
            log.error("KPF database is missing the 'capabilities' table")
        # Anything left in the schema set was not recognised above.
        if len(schema) > 0:
            for s in list(schema):
                log.error("Unexpected KDF database schema: %s" % s)
        cursor.close()
        conn.close()
        # A KPF is "prepub" unless its title metadata carries publication
        # identifiers (ASIN etc.), which only post-publication books have.
        self.book.is_kpf_prepub = True
        book_metadata_fragment = self.fragments.get("$490")
        if book_metadata_fragment is not None:
            for cm in book_metadata_fragment.value.get("$491", {}):
                if cm.get("$495", "") == "kindle_title_metadata":
                    for kv in cm.get("$258", []):
                        if kv.get("$492", "") in [
                            "ASIN",
                            "asset_id",
                            "cde_content_type",
                            "content_id",
                        ]:
                            self.book.is_kpf_prepub = False
                            break
                    break
        # Synthesise a $270 container-info fragment marking the KPF origin.
        self.fragments.append(
            YJFragment(
                ftype="$270",
                value=IonStruct(
                    IS("$587"), "", IS("$588"), "", IS("$161"), CONTAINER_FORMAT_KPF
                ),
            )
        )
        # If the KCB metadata points at the original EPUB source, load it.
        if self.kcb_datafile is not None and self.kcb_data is not None:
            source_path = self.kcb_data.get("metadata", {}).get("source_path")
            if source_path and os.path.splitext(source_path)[1] in [".epub", ".zip"]:
                epub_file = self.get_kpf_file(source_path)
                if epub_file is not None:
                    zip_file = io.BytesIO(epub_file.get_data())
                    self.source_epub = SourceEpub(zip_file)
                    zip_file.close()
def prep_payload_blob(self, data):
data = io.BytesIO(data).read()
if not data.startswith(DRMION_SIGNATURE):
return data
if self.ignore_drm:
return None
raise KFXDRMError("Book container has DRM and cannot be converted")
def create_local_symbol(self, symbol):
return self.book.create_local_symbol(symbol)
def get_resource_data(self, filename, report_missing=True):
try:
resource_datafile = self.kdf_datafile.relative_datafile(filename)
return resource_datafile.get_data()
except Exception:
if report_missing:
log.error("Missing resource in KPF: %s" % filename)
return None
def get_kpf_file(self, filename, report_missing=True):
try:
return self.kcb_datafile.relative_datafile(filename)
except Exception:
if report_missing:
log.error("Missing file in KPF: %s" % filename)
return None
    def deref_kfx_ids(self, data):
        """Replace $598 kfx_id annotations within *data* with local symbols.

        Walks the Ion tree in place: string kfx_ids become local symbols,
        integer eids are resolved through self.eid_symbol. Returns *data*
        (mutated). No-op when RETAIN_KFX_ID_ANNOT is set.
        """
        def process(data):
            # Returns a replacement value when *data* itself must be swapped
            # out by the caller, or None when it was handled/mutated in place.
            data_type = ion_type(data)
            if data_type is IonAnnotation:
                if data.is_annotation("$598"):
                    val = data.value
                    val_type = ion_type(val)
                    if val_type is IonString:
                        return self.create_local_symbol(val)
                    elif val_type is IonInt:
                        value = self.eid_symbol.get(val)
                        if value is not None:
                            return value
                        else:
                            log.error("Undefined kfx_id annotation eid: %d" % val)
                    else:
                        log.error(
                            "Unexpected data type for kfx_id annotation: %s" % val_type
                        )
                    # Unresolvable: fall back to the unannotated value.
                    return val
                # Other annotations: recurse into the wrapped value.
                process(data.value)
            if data_type is IonList or data_type is IonSExp:
                # Iterate over a snapshot since elements may be replaced.
                for i, val in enumerate(list(data)):
                    new_val = process(val)
                    if new_val is not None:
                        data.pop(i)
                        data.insert(i, new_val)
            if data_type is IonStruct:
                for key, val in data.items():
                    new_val = process(val)
                    if new_val is not None:
                        data[key] = new_val
            return None
        if not RETAIN_KFX_ID_ANNOT:
            process(data)
        return data
class SQLiteFingerprintWrapper(object):
    """Strips Amazon fingerprint records interleaved into a KDF SQLite file.

    Fingerprint records are 1024-byte blocks starting at offset 1024 and then
    every DATA_RECORD_LEN * DATA_RECORD_COUNT bytes; removing them restores a
    database that SQLite can open.
    """
    FINGERPRINT_OFFSET = 1024
    FINGERPRINT_RECORD_LEN = 1024
    DATA_RECORD_LEN = 1024
    DATA_RECORD_COUNT = 1024
    FINGERPRINT_SIGNATURE = b"\xfa\x50\x0a\x5f"
    def __init__(self, datafile):
        self.datafile = datafile
    def remove(self):
        """Return a DataFile with fingerprints removed (or the original
        datafile unchanged when no fingerprint is present)."""
        data = self.datafile.get_data()
        # No fingerprint at the expected offset: nothing to do.
        if (
            len(data) < self.FINGERPRINT_OFFSET + self.FINGERPRINT_RECORD_LEN
            or data[
                self.FINGERPRINT_OFFSET : self.FINGERPRINT_OFFSET
                + len(self.FINGERPRINT_SIGNATURE)
            ]
            != self.FINGERPRINT_SIGNATURE
        ):
            return self.datafile
        fingerprint_count = 0
        data_offset = self.FINGERPRINT_OFFSET
        while len(data) >= data_offset + self.FINGERPRINT_RECORD_LEN:
            fingerprint = Deserializer(
                data[data_offset : data_offset + self.FINGERPRINT_RECORD_LEN]
            )
            signature = fingerprint.extract(4)
            if signature != self.FINGERPRINT_SIGNATURE:
                # Corrupt/unexpected record: bail out with the original data.
                log.error(
                    "Unexpected fingerprint %d signature: %s"
                    % (fingerprint_count, bytes_to_separated_hex(signature))
                )
                return self.datafile
            # Excise this fingerprint record from the byte stream.
            data = (
                data[:data_offset] + data[data_offset + self.FINGERPRINT_RECORD_LEN :]
            )
            fingerprint_count += 1
            data_offset += self.DATA_RECORD_LEN * self.DATA_RECORD_COUNT
        log.info("Removed %d KDF SQLite file fingerprint(s)" % fingerprint_count)
        return DataFile(self.datafile.name + "-unwrapped", data)

View File

@@ -0,0 +1,33 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import threading
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
# Per-thread logger registry; each thread may install its own logger.
thread_local_cfg = threading.local()
def set_logger(logger=None):
    """Install *logger* as the current thread's logger and return it.

    Passing None removes the thread's logger so get_current_logger() falls
    back to the logging module (raises AttributeError if none was set).
    """
    # BUG FIX: this previously tested "log is not None" (the module-level
    # LogCurrent proxy, which is never None), so the removal branch was
    # unreachable and set_logger(None) stored None as the active logger.
    if logger is not None:
        thread_local_cfg.logger = logger
    else:
        del thread_local_cfg.logger
    return logger
def get_current_logger():
    """Return this thread's installed logger, defaulting to the logging module."""
    fallback = logging
    return getattr(thread_local_cfg, "logger", fallback)
class LogCurrent(object):
    """Proxy that forwards every attribute access (info, error, ...) to the
    logger currently registered for the calling thread."""
    def __getattr__(self, method_name):
        target = get_current_logger()
        return getattr(target, method_name)
# Module-wide logging handle used throughout the package.
log = LogCurrent()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,141 @@
from __future__ import absolute_import, division, print_function, unicode_literals
"""
from .python_transition import (IS_PYTHON2, bytes_, bytes_indexed, bytes_to_hex, bytes_to_list)
if IS_PYTHON2:
from .python_transition import (chr, html, http, repr, str, urllib)
else:
import html
import html.parser
import html.entities
import http.client
import http.cookiejar
import urllib.request
import urllib.parse
"""
import sys
# True when running under Python 2; selects the compatibility shims below.
IS_PYTHON2 = sys.version_info[0] == 2
if IS_PYTHON2:
    # Recreate the Python 3 module layout (html, http, urllib.parse/request)
    # on top of the Python 2 standard library so the rest of the package can
    # use Python 3 names unconditionally.
    import cgi
    from urllib import quote, quote_plus, unquote, urlencode
    import cookielib
    import htmlentitydefs
    import HTMLParser
    import httplib
    from urllib2 import (
        HTTPCookieProcessor,
        HTTPError,
        HTTPHandler,
        HTTPRedirectHandler,
        HTTPSHandler,
        Request,
        build_opener,
    )
    from urlparse import parse_qs, urljoin, urlparse, urlunparse
    class Object(object):
        # Bare attribute container used to fake module namespaces.
        pass
    html = Object()
    html.entities = htmlentitydefs
    html.escape = cgi.escape
    html.parser = HTMLParser
    html.unescape = HTMLParser.HTMLParser().unescape
    http = Object()
    http.client = httplib
    http.cookiejar = cookielib
    parse = Object()
    parse.parse_qs = parse_qs
    parse.quote = quote
    parse.quote_plus = quote_plus
    parse.unquote = unquote
    parse.urlencode = urlencode
    parse.urljoin = urljoin
    parse.urlparse = urlparse
    parse.urlunparse = urlunparse
    request = Object()
    request.build_opener = build_opener
    request.HTTPCookieProcessor = HTTPCookieProcessor
    request.HTTPError = HTTPError
    request.HTTPHandler = HTTPHandler
    request.HTTPSHandler = HTTPSHandler
    request.HTTPRedirectHandler = HTTPRedirectHandler
    request.Request = Request
    urllib = Object()
    urllib.parse = parse
    urllib.request = request
    # Map Python 3 text semantics onto Python 2 builtins.
    try:
        unicode
        unichr
    except NameError:
        unicode = unichr = None
    py2_chr = chr
    str = unicode
    chr = unichr
    def repr(obj):
        # Python 2 repr() escapes non-ASCII; defer to __repr__ directly.
        return obj.__repr__()
    class bytes_(bytes):
        # Python 3-style bytes constructor for Python 2 (where bytes is str).
        def __new__(cls, x):
            if isinstance(x, bytes):
                return x
            if isinstance(x, bytearray):
                return bytes(x)
            if isinstance(x, int):
                return b"\x00" * x
            if isinstance(x, list):
                return b"".join(py2_chr(i) for i in x)
            raise TypeError("Cannot convert %s to bytes" % type(x).__name__)
        @staticmethod
        def fromhex(s):
            if not isinstance(s, str):
                raise TypeError("fromhex %s" % type(s).__name__)
            return s.decode("hex")
    def bytes_indexed(b, i):
        # b[i] as an int, matching Python 3 bytes indexing.
        if not isinstance(b, bytes):
            raise TypeError("bytes_indexed %s" % type(b).__name__)
        return ord(b[i])
    def bytes_to_hex(b):
        if not isinstance(b, bytes):
            raise TypeError("bytes_to_hex %s" % type(b).__name__)
        return b.encode("hex").decode("ascii")
    def bytes_to_list(b):
        if not isinstance(b, bytes):
            raise TypeError("bytes_to_list %s" % type(b).__name__)
        return [ord(c) for c in list(b)]
else:
    # Python 3: the native types already behave as required.
    bytes_ = bytes
    def bytes_indexed(b, i):
        return b[i]
    def bytes_to_hex(b):
        return b.hex()
    def bytes_to_list(data):
        return list(data)

View File

@@ -0,0 +1,192 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import io
import posixpath
import zipfile
from .ion import IonAnnotation, IonBLOB
from .ion_text import IonText
from .message_logging import log
from .utilities import (
EXTS_OF_MIMETYPE,
DataFile,
font_file_ext,
image_file_ext,
json_serialize,
type_name,
)
from .yj_container import YJContainer, YJFragment
from .yj_structure import SYMBOL_FORMATS
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
class IonTextContainer(YJContainer):
    """Container for KFX books stored as Ion text: one top-level annotated
    value per fragment."""
    def deserialize(self, ignore_drm=False):
        """Parse the datafile's Ion text into self.fragments."""
        self.fragments.clear()
        parsed_values = IonText(self.symtab).deserialize_multiple_values(
            self.datafile.get_data(), import_symbols=True
        )
        for annot in parsed_values:
            if not isinstance(annot, IonAnnotation):
                raise Exception(
                    "deserialize kfx ion text expected IonAnnotation but found %s"
                    % type_name(annot)
                )
            self.fragments.append(YJFragment(annot))
    def serialize(self):
        """Render this container's fragments back out as Ion text."""
        fragments = self.get_fragments()
        return IonText(self.symtab).serialize_multiple_values(fragments)
class ZipUnpackContainer(YJContainer):
    """Container for the "unpacked" ZIP representation: book.ion (Ion text)
    plus one file per raw-media ($417) or font ($418) fragment.

    When serializing, a file extension inferred from the resource metadata is
    appended to extension-less names, flagged by a trailing '.' on the stem so
    deserialize can strip it again.
    """
    ADDED_EXT_FLAG_CHAR = "."
    def deserialize(self, ignore_drm=False):
        with self.datafile.as_ZipFile() as zf:
            # book.ion carries all non-resource fragments.
            for info in zf.infolist():
                if info.filename == "book.ion":
                    IonTextContainer(
                        self.symtab,
                        datafile=DataFile(info.filename, data=zf.read(info)),
                        fragments=self.fragments,
                    ).deserialize()
                    break
            else:
                raise Exception("book.ion file missing from ZipUnpackContainer")
            # Font resource locations ($262/$165) determine which raw files
            # become $418 (font) rather than $417 (media) fragments.
            fonts = set()
            for fragment in self.fragments:
                if fragment.ftype == "$262":
                    fonts.add(fragment.value.get("$165"))
            for info in zf.infolist():
                if info.filename != "book.ion" and not info.filename.endswith("/"):
                    fn, ext = posixpath.splitext(info.filename)
                    # Strip an extension that serialize() added (stem ends
                    # with the flag char) to recover the original fid.
                    fid = (
                        fn[:-1]
                        if ext and fn.endswith(self.ADDED_EXT_FLAG_CHAR)
                        else info.filename
                    )
                    self.fragments.append(
                        YJFragment(
                            ftype=("$418" if fid in fonts else "$417"),
                            fid=fid,
                            value=IonBLOB(zf.read(info)),
                        )
                    )
    def serialize(self):
        # Work out the preferred extension for each resource location from
        # the $164 resource fragments (format symbol, then MIME type).
        desired_extension = {}
        for fragment in self.fragments.get_all("$164"):
            location = fragment.value.get("$165", "")
            extension = posixpath.splitext(location)[1]
            if not extension:
                format = fragment.value.get("$161")
                if format in SYMBOL_FORMATS:
                    extension = "." + SYMBOL_FORMATS[format]
            if extension in ["", ".pobject"]:
                mime = fragment.value.get("$162")
                if mime in EXTS_OF_MIMETYPE and mime != "figure":
                    extension = EXTS_OF_MIMETYPE[mime][0]
            if extension:
                if location:
                    desired_extension[location] = extension
                # Tiled images ($636) share the parent resource's extension.
                if "$636" in fragment.value:
                    for tile_row in fragment.value["$636"]:
                        for tile_location in tile_row:
                            desired_extension[tile_location] = extension
        zfile = io.BytesIO()
        with zipfile.ZipFile(zfile, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            # book.ion holds everything except the raw resources.
            zf.writestr(
                "book.ion",
                IonTextContainer(
                    self.symtab, fragments=self.fragments.filtered(omit_resources=True)
                ).serialize(),
            )
            for ftype in ["$417", "$418"]:
                for fragment in self.fragments.get_all(ftype):
                    fn = fragment.fid.tostring()
                    if not posixpath.splitext(fn)[1]:
                        # No extension on the fid: add one (flagged with '.')
                        # from metadata, else by sniffing the content.
                        if ftype == "$417":
                            if fn in desired_extension:
                                fn += self.ADDED_EXT_FLAG_CHAR + desired_extension[fn]
                            else:
                                extension = image_file_ext(fragment.value)
                                if extension:
                                    fn += self.ADDED_EXT_FLAG_CHAR + extension
                        else:
                            extension = font_file_ext(fragment.value)
                            if extension:
                                fn += self.ADDED_EXT_FLAG_CHAR + extension
                    zf.writestr(fn, bytes(fragment.value))
        data = zfile.getvalue()
        zfile.close()
        return data
class JsonContentContainer(object):
    """Exports a book's positioned text/image content as a JSON document."""
    VERSION = "1.1"
    TYPE_TEXT = 1
    TYPE_IMAGE = 2
    TYPE_HTML = 8
    def __init__(self, book):
        self.book = book
    def serialize(self):
        """Serialize the book's content-position info to UTF-8 JSON bytes.

        Verifies that chunk PIDs are contiguous and that text chunk lengths
        match their character counts, logging errors for any discrepancy.
        """
        chunks = self.book.collect_content_position_info()
        data = []
        next_pid = 0
        for chunk in chunks:
            if chunk.pid != next_pid:
                log.error(
                    "next PID is %d but expected %d: %s"
                    % (chunk.pid, next_pid, repr(chunk))
                )
                next_pid = chunk.pid
            if chunk.text is not None:
                if len(chunk.text) != chunk.length:
                    log.error(
                        "chunk length %d but have %d characters: %s"
                        % (chunk.length, len(chunk.text), repr(chunk))
                    )
                data.append(
                    {
                        "content": chunk.text,
                        "position": chunk.pid,
                        "type": self.TYPE_TEXT,
                    }
                )
            elif chunk.image_resource is not None:
                if chunk.length != 1:
                    log.error(
                        "chunk length %d for image: %s" % (chunk.length, repr(chunk))
                    )
                data.append(
                    {
                        "content": chunk.image_resource,
                        "position": chunk.pid,
                        "type": self.TYPE_IMAGE,
                    }
                )
            next_pid += chunk.length
        content = {"data": data, "version": self.VERSION}
        return json_serialize(content, sort_keys=True, indent=2).encode("utf-8")

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,3 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__version__ = "20220215"

View File

@@ -0,0 +1,378 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import posixpath
import traceback
from .ion_symbol_table import LocalSymbolTable, SymbolTableCatalog
from .ion_text import IonText
from .kfx_container import MAX_KFX_CONTAINER_SIZE, KfxContainer
from .kpf_book import KpfBook
from .kpf_container import KpfContainer
from .message_logging import log
from .python_transition import IS_PYTHON2
from .unpack_container import IonTextContainer, JsonContentContainer, ZipUnpackContainer
from .utilities import (
ZIP_SIGNATURE,
DataFile,
KFXDRMError,
bytes_to_separated_hex,
file_read_utf8,
flush_unicode_cache,
temp_file_cleanup,
)
from .yj_container import YJFragmentList
from .yj_metadata import BookMetadata
from .yj_position_location import BookPosLoc
from .yj_structure import BookStructure
from .yj_symbol_catalog import YJ_SYMBOLS, IonSharedSymbolTable
if IS_PYTHON2:
from .python_transition import repr
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
class YJ_Book(BookStructure, BookPosLoc, BookMetadata, KpfBook):
def __init__(
self, file, credentials=[], is_netfs=False, symbol_catalog_filename=None
):
self.datafile = DataFile(file)
self.credentials = credentials
self.is_netfs = is_netfs
self.symbol_catalog_filename = symbol_catalog_filename
self.reported_errors = set()
self.symtab = LocalSymbolTable(YJ_SYMBOLS.name)
self.fragments = YJFragmentList()
self.reported_missing_fids = set()
self.is_kpf_prepub = self.is_dictionary = False
self.yj_containers = []
self.kpf_container = None
self.load_symbol_catalog()
def load_symbol_catalog(self):
if self.symbol_catalog_filename is not None:
if not os.path.isfile(self.symbol_catalog_filename):
raise Exception(
"Symbol catalog %s does not exist" % self.symbol_catalog_filename
)
translation_catalog = SymbolTableCatalog()
catalog_symtab = LocalSymbolTable(catalog=translation_catalog)
try:
IonText(catalog_symtab).deserialize_multiple_values(
file_read_utf8(self.symbol_catalog_filename), import_symbols=True
)
except Exception:
log.error(
"Failed to parse symbol catalog %s" % self.symbol_catalog_filename
)
raise
translation_symtab = translation_catalog.get_shared_symbol_table(
YJ_SYMBOLS.name
)
if translation_symtab is None:
raise Exception(
"Symbol catalog %s does not contain a definition for YJ_symbols"
% self.symbol_catalog_filename
)
catalog_symtab.report()
log.info(
"Symbol catalog defines %d symbols in YJ_symbols"
% len(translation_symtab.symbols)
)
else:
translation_symtab = IonSharedSymbolTable(YJ_SYMBOLS.name)
self.symtab.set_translation(translation_symtab)
def final_actions(self, do_symtab_report=True):
if do_symtab_report:
self.symtab.report()
flush_unicode_cache()
temp_file_cleanup()
def convert_to_single_kfx(self):
self.decode_book()
if self.is_dictionary:
raise Exception("Cannot serialize dictionary as KFX container")
if self.is_kpf_prepub:
raise Exception("Cannot serialize KPF as KFX container without fix-up")
result = KfxContainer(self.symtab, fragments=self.fragments).serialize()
if len(result) > MAX_KFX_CONTAINER_SIZE:
log.warning(
"KFX container created may be too large for some devices (%d bytes)"
% len(result)
)
pass
self.final_actions()
return result
def convert_to_epub(self, epub2_desired=False):
from .yj_to_epub import KFX_EPUB
self.decode_book()
result = KFX_EPUB(self, epub2_desired).decompile_to_epub()
self.final_actions()
return result
def convert_to_pdf(self):
from .yj_to_pdf import KFX_PDF
self.decode_book()
if self.has_pdf_resource:
result = KFX_PDF(self).extract_pdf_resources()
elif self.is_fixed_layout:
result = KFX_PDF(self).convert_image_resources()
else:
result = None
self.final_actions()
return result
    def get_metadata(self):
        """Extract book metadata without fully decoding the book.

        Deserializes each located container best-effort, then merges their
        fragments until both metadata and cover data are present. Raises when
        no container yields metadata.
        """
        self.locate_book_datafiles()
        yj_datafile_containers = []
        # Phase 1: deserialize every container, skipping any that fail.
        for datafile in self.container_datafiles:
            try:
                container = self.get_container(datafile, ignore_drm=True)
                if container is not None:
                    container.deserialize(ignore_drm=True)
                    yj_datafile_containers.append((datafile, container))
            except Exception as e:
                log.warning(
                    "Failed to extract content from %s: %s" % (datafile.name, repr(e))
                )
        # Phase 2: merge fragments, stopping early once metadata + cover found.
        for datafile, container in yj_datafile_containers:
            try:
                self.fragments.extend(container.get_fragments())
            except Exception as e:
                log.warning(
                    "Failed to extract content from %s: %s" % (datafile.name, repr(e))
                )
                continue
            if self.has_metadata() and self.has_cover_data():
                break
        if not self.has_metadata():
            raise Exception("Failed to locate a KFX container with metadata")
        self.final_actions(do_symtab_report=False)
        return self.get_yj_metadata_from_book()
    def convert_to_kpf(
        self, conversion=None, flags=None, timeout_sec=None, cleaned_filename=None
    ):
        """Convert the source document to KPF via an external converter.

        conversion: converter name optionally followed by "/flag" segments
            (default "KPR_CLI"). Extra segments are merged into *flags*.
        Returns a ConversionResult; conversion failures are captured in the
        result's error_msg rather than raised.
        """
        from .generate_kpf_common import ConversionResult
        from .generate_kpf_using_cli import KPR_CLI
        if not self.datafile.is_real_file:
            raise Exception("Cannot create KPF from stream")
        infile = self.datafile.name
        intype = os.path.splitext(infile)[1]
        if not conversion:
            conversion = "KPR_CLI"
        flags = set() if flags is None else set(flags)
        # "NAME/flag1/flag2" -> converter name plus extra flags.
        options = conversion.split("/")
        conversion_name = options[0]
        flags |= set(options[1:])
        ALL_TYPES = [".doc", ".docx", ".epub", ".mobi", ".opf"]
        if conversion_name == "KPR_CLI" and intype in ALL_TYPES:
            conversion_sequence = KPR_CLI()
        else:
            return ConversionResult(
                error_msg="Cannot generate KPF from %s file using %s"
                % (intype, conversion_name)
            )
        try:
            result = conversion_sequence.convert(
                infile, flags, timeout_sec, cleaned_filename
            )
        except Exception as e:
            traceback.print_exc()
            result = ConversionResult(error_msg=repr(e))
        self.final_actions(do_symtab_report=False)
        return result
def convert_to_zip_unpack(self):
self.decode_book()
result = ZipUnpackContainer(self.symtab, fragments=self.fragments).serialize()
self.final_actions()
return result
def convert_to_json_content(self):
self.decode_book()
result = JsonContentContainer(self).serialize()
self.final_actions()
return result
def decode_book(
self,
set_metadata=None,
set_approximate_pages=None,
pure=False,
retain_yj_locals=False,
):
if self.fragments:
if (
set_metadata is not None
or set_approximate_pages is not None
or retain_yj_locals
):
raise Exception(
"Attempt to change metadata after book has already been decoded"
)
return
self.locate_book_datafiles()
for datafile in self.container_datafiles:
log.info("Processing container: %s" % datafile.name)
container = self.get_container(datafile)
if container:
container.deserialize()
self.yj_containers.append(container)
for container in self.yj_containers:
self.fragments.extend(container.get_fragments())
if self.is_kpf_prepub:
self.fix_kpf_prepub_book(not pure, retain_yj_locals)
if True:
self.check_consistency()
if not pure:
if set_metadata is not None:
self.set_yj_metadata_to_book(set_metadata)
if set_approximate_pages is not None and set_approximate_pages >= 0:
try:
self.create_approximate_page_list(set_approximate_pages)
except Exception as e:
traceback.print_exc()
log.error(
"Exception creating approximate page numbers: %s" % repr(e)
)
try:
self.report_features_and_metadata(unknown_only=False)
except Exception as e:
traceback.print_exc()
log.error("Exception checking book features and metadata: %s" % repr(e))
self.check_fragment_usage(rebuild=not pure, ignore_extra=False)
self.check_symbol_table(rebuild=not pure)
self.final_actions()
    def locate_book_datafiles(self):
        """Populate self.container_datafiles with every KFX container source.

        Handles: a directory (scanned recursively), a single container file
        (plus its .sdr sidecar directory for .kfx), or a zip that either IS a
        container (holds book.ion/book.kdf) or bundles several containers.
        """
        self.container_datafiles = []
        if self.datafile.is_real_file and os.path.isdir(self.datafile.name):
            self.locate_files_from_dir(self.datafile.name)
        elif self.datafile.ext in [".azw8", ".ion", ".kfx", ".kpf"]:
            self.container_datafiles.append(self.datafile)
            # A .kfx often has companion containers in its .sdr directory.
            if self.datafile.ext == ".kfx" and self.datafile.is_real_file:
                sdr_dirname = os.path.splitext(self.datafile.name)[0] + ".sdr"
                if os.path.isdir(sdr_dirname):
                    self.locate_files_from_dir(sdr_dirname)
        elif self.datafile.ext in [".kfx-zip", ".zip"]:
            with self.datafile.as_ZipFile() as zf:
                for info in zf.infolist():
                    if posixpath.basename(info.filename) in ["book.ion", "book.kdf"]:
                        # The zip itself is one container.
                        self.container_datafiles.append(self.datafile)
                        break
                else:
                    # Otherwise treat each member as a candidate container.
                    for info in zf.infolist():
                        self.check_located_file(
                            info.filename, zf.read(info), self.datafile
                        )
        else:
            raise Exception(
                "Unknown main file type. Must be azw8, ion, kfx, kfx-zip, kpf, or zip."
            )
        if not self.container_datafiles:
            raise Exception("No KFX containers found. This book is not in KFX format.")
        self.container_datafiles = sorted(self.container_datafiles)
def locate_files_from_dir(self, directory, match=None):
for dirpath, dirnames, filenames in os.walk(directory):
for fn in filenames:
if (not match) or match == fn:
self.check_located_file(os.path.join(dirpath, fn))
def check_located_file(self, name, data=None, parent=None):
basename = posixpath.basename(name.replace("\\", "/"))
ext = os.path.splitext(basename)[1]
if ext in [".azw", ".azw8", ".azw9", ".kfx", ".md", ".res", ".yj"]:
self.container_datafiles.append(DataFile(name, data, parent))
    def get_container(self, datafile, ignore_drm=False):
        """Instantiate the right container class for *datafile* by sniffing it.

        Dispatch order: .ion extension -> Ion text; ZIP signature -> KPF or
        unpacked-zip depending on book.kdf/book.ion; SQLite signature -> KDF;
        KFX signature -> KFX; DRM signature -> raise (or None with ignore_drm).
        Raises for MOBI files and unrecognised data.
        """
        if datafile.ext == ".ion":
            return IonTextContainer(self.symtab, datafile)
        data = datafile.get_data()
        if data.startswith(ZIP_SIGNATURE):
            with datafile.as_ZipFile() as zf:
                for info in zf.infolist():
                    if posixpath.basename(info.filename) in ["book.ion", "book.kdf"]:
                        if info.filename.endswith(".kdf"):
                            return KpfContainer(self.symtab, datafile, book=self)
                        else:
                            return ZipUnpackContainer(self.symtab, datafile)
        if data.startswith(KpfContainer.KDF_SIGNATURE):
            return KpfContainer(self.symtab, datafile, book=self)
        if data.startswith(KfxContainer.SIGNATURE):
            return KfxContainer(self.symtab, datafile)
        if data.startswith(KfxContainer.DRM_SIGNATURE):
            if ignore_drm:
                return None
            raise KFXDRMError(
                "Book container %s has DRM and cannot be converted" % datafile.name
            )
        # PalmDOC header type field at offset 0x3C identifies MOBI files.
        if data[0x3C : 0x3C + 8] in [b"BOOKMOBI", b"RBINCONT"]:
            raise Exception("File format is MOBI (not KFX) for %s" % datafile.name)
        raise Exception(
            "Unable to determine KFX container type of %s (%s)"
            % (datafile.name, bytes_to_separated_hex(data[:8]))
        )

View File

@@ -0,0 +1,404 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import functools
from .ion import IonAnnotation, IonAnnots, IonBLOB, IonList, IonSymbol, ion_type
from .python_transition import IS_PYTHON2
from .utilities import list_symbols, natural_sort_key, type_name
if IS_PYTHON2:
from .python_transition import repr, str
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
# Signature prefix of a DRM-wrapped Ion payload.
DRMION_SIGNATURE = b"\xeaDRMION\xee"
# Human-readable container-format labels used in $270 container-info fragments.
CONTAINER_FORMAT_KPF = "KPF"
CONTAINER_FORMAT_KFX_MAIN = "KFX main"
CONTAINER_FORMAT_KFX_METADATA = "KFX metadata"
CONTAINER_FORMAT_KFX_ATTACHABLE = "KFX attachable"
# Fragment types whose value is a raw BLOB rather than structured Ion
# ($418 = font data, $417 = other raw media).
RAW_FRAGMENT_TYPES = {"$418", "$417"}
# Canonical ordering of fragment types for serialization/sorting.
# (Name keeps the historical "PREFERED" spelling [sic] — referenced elsewhere.)
PREFERED_FRAGMENT_TYPE_ORDER = [
    "$ion_symbol_table",
    "$270",
    "$593",
    "$585",
    "$490",
    "$258",
    "$538",
    "$389",
    "$390",
    "$260",
    "$259",
    "$608",
    "$145",
    "$756",
    "$692",
    "$157",
    "$391",
    "$266",
    "$394",
    "$264",
    "$265",
    "$550",
    "$609",
    "$621",
    "$611",
    "$610",
    "$597",
    "$267",
    "$387",
    "$395",
    "$262",
    "$164",
    "$418",
    "$417",
    "$419",
]
# Fragment types that appear without a distinct fragment id (fid == ftype).
ROOT_FRAGMENT_TYPES = {
    "$ion_symbol_table",
    "$270",
    "$490",
    "$389",
    "$419",
    "$585",
    "$538",
    "$262",
    "$593",
    "$550",
    "$258",
    "$265",
    "$264",
    "$395",
    "$390",
    "$621",
    "$611",
}
# Root fragment types that may occur at most once per book.
SINGLETON_FRAGMENT_TYPES = ROOT_FRAGMENT_TYPES - {
    "$270",
    "$262",
    "$593",
}
# Fragment types every complete book must contain.
REQUIRED_BOOK_FRAGMENT_TYPES = {
    "$ion_symbol_table",
    "$270",
    "$490",
    "$389",
    "$419",
    "$538",
    "$550",
    "$258",
    "$265",
    "$264",
    "$611",
}
# Additional fragment types that may legitimately appear in a book.
ALLOWED_BOOK_FRAGMENT_TYPES = {
    "$266",
    "$597",
    "$418",
    "$417",
    "$394",
    "$145",
    "$585",
    "$610",
    "$164",
    "$262",
    "$593",
    "$391",
    "$692",
    "$387",
    "$395",
    "$756",
    "$260",
    "$267",
    "$390",
    "$609",
    "$259",
    "$608",
    "$157",
    "$621",
}
KNOWN_FRAGMENT_TYPES = REQUIRED_BOOK_FRAGMENT_TYPES | ALLOWED_BOOK_FRAGMENT_TYPES
# Fragment types describing the container itself rather than book content.
CONTAINER_FRAGMENT_TYPES = [
    "$270",
    "$593",
    "$ion_symbol_table",
    "$419",
]
class YJContainer(object):
    """Base holder pairing a symbol table with the fragments of a container.

    Subclasses are expected to populate self.fragments from self.datafile.
    """
    def __init__(self, symtab, datafile=None, fragments=None):
        self.symtab = symtab
        self.datafile = datafile
        if fragments is None:
            fragments = YJFragmentList()
        self.fragments = fragments
    def get_fragments(self):
        """Return the fragment list held by this container."""
        return self.fragments
@functools.total_ordering
class YJFragmentKey(IonAnnots):
    """Immutable identity of a fragment, stored as an annotation tuple of
    (fid, ftype), (ftype,) alone, or (fid,) alone. Comparable with both
    other keys and YJFragment instances (via their key)."""
    def __new__(cls, arg=None, ftype=None, fid=None, annot=None):
        # Keyword-only construction: a positional argument is rejected so
        # call sites stay self-describing.
        if arg is not None:
            raise Exception("YJFragmentKey initializer missing keyword")
        if annot is not None:
            return IonAnnots.__new__(cls, tuple(annot))
        if fid is None:
            return IonAnnots.__new__(cls, [IonSymbol(ftype)])
        if ftype is None:
            return IonAnnots.__new__(cls, [IonSymbol(fid)])
        return IonAnnots.__new__(cls, [IonSymbol(fid), IonSymbol(ftype)])
    def sort_key(self):
        # Sort by position of ftype in the preferred order (unknown types
        # sort after all known ones), then by natural ordering of the fid.
        return (
            PREFERED_FRAGMENT_TYPE_ORDER.index(self.ftype)
            if self.ftype in PREFERED_FRAGMENT_TYPE_ORDER
            else len(PREFERED_FRAGMENT_TYPE_ORDER),
            natural_sort_key(self.fid),
        )
    def __eq__(self, other):
        # Comparing with anything other than a fragment or key is a bug.
        if isinstance(other, YJFragment):
            return self == other.annotations
        if isinstance(other, YJFragmentKey):
            return tuple(self) == tuple(other)
        raise Exception("YJFragmentKey __eq__: comparing with %s" % type_name(other))
    def __lt__(self, other):
        if isinstance(other, YJFragment):
            return self < other.annotations
        if isinstance(other, YJFragmentKey):
            return self.sort_key() < other.sort_key()
        raise Exception("YJFragmentKey __lt__: comparing with %s" % type_name(other))
    def __hash__(self):
        return hash(tuple(self))
    @property
    def fid(self):
        # Fragment id: first annotation (equals ftype for one-element keys).
        return self[0]
    @fid.setter
    def fid(self, value):
        raise Exception("Attempt to modify YJFragmentKey fid")
    @property
    def ftype(self):
        # Fragment type: last annotation.
        return self[-1]
    @ftype.setter
    def ftype(self, value):
        raise Exception("Attempt to modify YJFragmentKey ftype")
@functools.total_ordering
class YJFragment(IonAnnotation):
    """A unit of book content: an ion value annotated with a YJFragmentKey.
    Equality, ordering and hashing delegate to the key so fragments can be
    indexed and sorted by identity."""
    def __init__(self, arg=None, ftype=None, fid=None, value=None):
        # Accepts an existing key, any IonAnnotation (key is rebuilt from
        # its annotations), or explicit ftype/fid keywords.
        if isinstance(arg, YJFragmentKey):
            IonAnnotation.__init__(self, arg, value)
        elif isinstance(arg, IonAnnotation):
            IonAnnotation.__init__(
                self, YJFragmentKey(annot=arg.annotations), arg.value
            )
        else:
            IonAnnotation.__init__(self, YJFragmentKey(ftype=ftype, fid=fid), value)
    def __hash__(self):
        return hash(self.annotations)
    def __eq__(self, other):
        if isinstance(other, YJFragment):
            return self.annotations == other.annotations
        if isinstance(other, YJFragmentKey):
            return self.annotations == other
        raise Exception("YJFragment __eq__: comparing with %s" % type_name(other))
    def __lt__(self, other):
        if isinstance(other, YJFragment):
            return self.annotations < other.annotations
        if isinstance(other, YJFragmentKey):
            return self.annotations < other
        raise Exception("YJFragment __lt__: comparing with %s" % type_name(other))
    @property
    def fid(self):
        # Fragment id taken from the key.
        return self.annotations[0]
    @fid.setter
    def fid(self, value):
        raise Exception("Attempt to modify YJFragment fid")
    @property
    def ftype(self):
        # Fragment type taken from the key.
        return self.annotations[-1]
    @ftype.setter
    def ftype(self, value):
        raise Exception("Attempt to modify YJFragment ftype")
class YJFragmentList(IonList):
    """List of YJFragment with lazily maintained hash indexes by fragment
    type and by full key.

    Mutations only set the yj_dirty flag; the indexes are rebuilt on the
    next lookup. Only YJFragment instances may be stored.
    """
    def __init__(self, *args):
        IonList.__init__(self, *args)
        self.yj_dirty = True
        self.yj_ftype_index = collections.defaultdict(list)
        self.yj_fragment_index = collections.defaultdict(list)
    def yj_rebuild_index(self):
        # Rebuild both indexes from scratch, validating element types.
        self.yj_ftype_index.clear()
        self.yj_fragment_index.clear()
        for f in self:
            if not isinstance(f, YJFragment):
                raise Exception(
                    "YJFragmentList contains non-YJFragment: %s" % type_name(f)
                )
            self.yj_ftype_index[f.ftype].append(f)
            self.yj_fragment_index[f].append(f)
        self.yj_dirty = False
    def get_all(self, ftype=None):
        # All fragments of the given type (possibly an empty list).
        return self.get(ftype=ftype, all=True)
    def get(self, ftype=None, default=None, fid=None, first=False, all=False):
        # Lookup by int index, full YJFragmentKey, (ftype, fid) pair, or
        # ftype alone. NOTE: "all" shadows the builtin but is part of the
        # established public signature.
        key = ftype
        if isinstance(key, int):
            return list.__getitem__(self, key)
        if self.yj_dirty:
            self.yj_rebuild_index()
        if isinstance(key, YJFragmentKey):
            matches = self.yj_fragment_index.get(key, [])
        elif fid is not None:
            key = YJFragmentKey(ftype=ftype, fid=fid)
            matches = self.yj_fragment_index.get(key, [])
        else:
            matches = self.yj_ftype_index.get(ftype, [])
        if all:
            return list(matches)
        if not matches:
            return default
        if len(matches) > 1 and not first:
            # An ambiguous lookup is an error unless the caller opted into
            # taking the first match.
            raise KeyError(
                "YJFragmentList get has multiple matches for %s: %s"
                % (repr(key), list_symbols(matches))
            )
        return matches[0]
    def __getitem__(self, key):
        fragment = self.get(key)
        if fragment is None:
            raise KeyError("YJFragmentList item is missing: %s" % repr(key))
        return fragment
    def append(self, value):
        if not isinstance(value, YJFragment):
            raise Exception(
                "YJFragmentList append non-YJFragment: %s" % type_name(value)
            )
        IonList.append(self, value)
        self.yj_dirty = True
    def extend(self, values):
        if not isinstance(values, YJFragmentList):
            raise Exception(
                "YJFragmentList extend non-YJFragmentList: %s" % type_name(values)
            )
        IonList.extend(self, values)
        self.yj_dirty = True
    def remove(self, value):
        # Strict removal: raises when the exact object is absent.
        if not self.discard(value):
            raise KeyError("YJFragmentList remove, item is missing: %s" % str(value))
    def discard(self, value):
        # Remove by object identity (not equality); True when removed.
        if not isinstance(value, YJFragment):
            raise Exception(
                "YJFragmentList remove non-YJFragment: %s" % type_name(value)
            )
        for i, f in enumerate(self):
            if f is value:
                self.pop(i)
                self.yj_dirty = True
                return True
        return False
    def ftypes(self):
        # Set of fragment types currently present.
        if self.yj_dirty:
            self.yj_rebuild_index()
        return set(self.yj_ftype_index.keys())
    def filtered(self, omit_resources=False, omit_large_blobs=False):
        # Copy with raw resource fragments dropped and/or large BLOB values
        # replaced by their repr; returns self when no filtering requested.
        if not (omit_resources or omit_large_blobs):
            return self
        filtered_fragments = YJFragmentList()
        for fragment in list(self):
            if fragment.ftype in RAW_FRAGMENT_TYPES:
                if omit_resources:
                    continue
                if (
                    omit_large_blobs
                    and ion_type(fragment.value) is IonBLOB
                    and fragment.value.is_large()
                ):
                    fragment = YJFragment(
                        ftype=fragment.ftype,
                        fid=fragment.fid,
                        value=repr(fragment.value),
                    )
            filtered_fragments.append(fragment)
        return filtered_fragments
    def clear(self):
        del self[:]

View File

@@ -0,0 +1,896 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import io
import random
import string
from PIL import Image
from .ion import IS, IonBLOB, IonStruct, IonSymbol, ion_type, unannotated
from .message_logging import log
from .python_transition import IS_PYTHON2
from .utilities import (
convert_pdf_to_jpeg,
disable_debug_log,
jpeg_type,
list_symbols,
list_symbols_unsorted,
quote_name,
)
from .yj_container import YJFragment, YJFragmentKey
from .yj_structure import (
FORMAT_SYMBOLS,
KFX_COVER_RESOURCE,
METADATA_NAMES,
METADATA_SYMBOLS,
SYMBOL_FORMATS,
)
from .yj_versions import (
PACKAGE_VERSION_PLACEHOLDERS,
is_known_feature,
is_known_generator,
is_known_metadata,
)
if IS_PYTHON2:
from .python_transition import repr, str
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
class YJ_Metadata(object):
    """Plain container for book metadata read from or written to a KFX book.

    author_sort_fn maps an author name to its sort form (defaults to
    author_sort_name); replace_existing_authors_with_sort, when true, makes
    writers re-derive author entries from their sorted forms.
    """
    def __init__(self, author_sort_fn=None, replace_existing_authors_with_sort=False):
        self.authors = []
        if author_sort_fn is None:
            author_sort_fn = author_sort_name
        self.author_sort_fn = author_sort_fn
        self.replace_existing_authors_with_sort = replace_existing_authors_with_sort
        # All remaining metadata fields start out unset.
        for field in (
            "title", "cde_content_type", "asin", "cover_image_data",
            "description", "issue_date", "language", "publisher",
            "book_id", "features", "asset_id",
        ):
            setattr(self, field, None)
class BookMetadata(object):
    def get_yj_metadata_from_book(self):
        """Build a YJ_Metadata snapshot from the book's metadata fragments.

        Values from the $490 (book metadata) fragment take precedence;
        fields still unset fall back to the legacy $258 metadata fragment.
        Author names are normalized via unsort_author_name and
        de-duplicated. Cover image data and the feature set are attached
        when available.
        """
        yj_metadata = YJ_Metadata()
        authors = []
        fragment = self.fragments.get("$490")
        if fragment is not None:
            # kindle_title_metadata key/value entries of the $490 fragment.
            for cm in fragment.value.get("$491", {}):
                if cm.get("$495", "") == "kindle_title_metadata":
                    for kv in cm.get("$258", []):
                        key = kv.get("$492", "")
                        val = kv.get("$307", "")
                        if key == "author":
                            authors.append(val)
                        elif key == "title":
                            yj_metadata.title = val
                        elif key == "cde_content_type":
                            yj_metadata.cde_content_type = val
                        elif key == "ASIN":
                            yj_metadata.asin = val
                        elif key == "description":
                            yj_metadata.description = val
                        elif key == "issue_date":
                            yj_metadata.issue_date = val
                        elif key == "language":
                            yj_metadata.language = val
                        elif key == "publisher":
                            yj_metadata.publisher = val
                        elif key == "book_id":
                            yj_metadata.book_id = val
                        elif key == "asset_id":
                            yj_metadata.asset_id = val
        fragment = self.fragments.get("$258")
        if fragment is not None:
            for name, val in fragment.value.items():
                key = METADATA_NAMES.get(name, "")
                if key == "author" and not authors:
                    # Heuristic splitting of a single combined author string:
                    # prefer "&" separators, then " and " (with a special
                    # case for "Last1, First1 and Last2, First2" style).
                    if " & " in val:
                        for author in val.split("&"):
                            authors.append(author.strip())
                    elif " and " in val:
                        auths = val.split(" and ")
                        if len(auths) == 2 and "," in auths[0] and "," not in auths[1]:
                            auths = auths[0].split(",") + [auths[1]]
                        for author in auths:
                            authors.append(author.strip())
                    elif val:
                        authors.append(val)
                elif key == "title" and not yj_metadata.title:
                    yj_metadata.title = val
                elif key == "cde_content_type" and not yj_metadata.cde_content_type:
                    yj_metadata.cde_content_type = val
                elif key == "ASIN" and not yj_metadata.asin:
                    yj_metadata.asin = val
                elif key == "description" and not yj_metadata.description:
                    yj_metadata.description = val
                elif key == "issue_date" and not yj_metadata.issue_date:
                    yj_metadata.issue_date = val
                elif key == "language" and not yj_metadata.language:
                    yj_metadata.language = val
                elif key == "publisher" and not yj_metadata.publisher:
                    yj_metadata.publisher = val
                elif key == "asset_id" and not yj_metadata.asset_id:
                    yj_metadata.asset_id = val
        yj_metadata.authors = []
        for author in authors:
            author = unsort_author_name(author)
            if author and author not in yj_metadata.authors:
                yj_metadata.authors.append(author)
        cover_image_data = self.get_cover_image_data()
        if cover_image_data is not None:
            yj_metadata.cover_image_data = cover_image_data
        yj_metadata.features = self.get_features()
        return yj_metadata
    def set_yj_metadata_to_book(self, yj_metadata):
        """Write the fields of a YJ_Metadata into the book's metadata
        fragments.

        Only fields that are not None replace existing entries; other
        existing entries are preserved. An asin of True generates a random
        32-character ASIN. Updates both the $490 kindle_title_metadata
        entries and the legacy $258 metadata fragment, and replaces the
        cover image when cover_image_data differs from the current one.
        """
        authors = (
            [yj_metadata.author_sort_fn(author) for author in yj_metadata.authors]
            if yj_metadata.authors is not None
            else None
        )
        if yj_metadata.asin is True:
            # Caller asked for a fresh ASIN rather than supplying one.
            yj_metadata.asin = "".join(
                random.choice(string.ascii_uppercase + string.digits) for _ in range(32)
            )
        book_metadata_fragment = self.fragments.get("$490")
        metadata_fragment = self.fragments.get("$258")
        if book_metadata_fragment is None and metadata_fragment is None:
            log.error("Cannot set metadata due to missing metadata fragments in book")
        cover_image = None
        if yj_metadata.cover_image_data is not None:
            new_cover_image_data = self.fix_cover_image_data(
                yj_metadata.cover_image_data
            )
            if new_cover_image_data != self.get_cover_image_data():
                cover_image = self.set_cover_image_data(new_cover_image_data)
        if book_metadata_fragment is not None:
            for cm in book_metadata_fragment.value.get("$491", {}):
                if cm.get("$495", "") == "kindle_title_metadata":
                    # Rebuild the key/value list: keep entries we are not
                    # replacing, then append the replacements. The sequence
                    # number preserves relative order within equal keys.
                    new_ksv = []
                    for kv in cm.get("$258", []):
                        key = kv.get("$492", "")
                        val = kv.get("$307", "")
                        if (
                            key == "author"
                            and yj_metadata.replace_existing_authors_with_sort
                        ):
                            if authors is None:
                                authors = []
                            authors.append(yj_metadata.author_sort_fn(val))
                        elif (
                            (key == "author" and authors is not None)
                            or (key == "title" and yj_metadata.title is not None)
                            or (
                                key == "cde_content_type"
                                and yj_metadata.cde_content_type is not None
                            )
                            or (key == "ASIN" and yj_metadata.asin is not None)
                            or (key == "content_id" and yj_metadata.asin is not None)
                            or (key == "cover_image" and cover_image is not None)
                            or (
                                key == "description"
                                and yj_metadata.description is not None
                            )
                            or (
                                key == "issue_date"
                                and yj_metadata.issue_date is not None
                            )
                            or (key == "language" and yj_metadata.language is not None)
                            or (
                                key == "publisher" and yj_metadata.publisher is not None
                            )
                        ):
                            # Entry will be superseded below; drop it here.
                            pass
                        elif key:
                            new_ksv.append((key, len(new_ksv), val))
                    if authors is not None:
                        for author in authors:
                            new_ksv.append(("author", len(new_ksv), author))
                    if yj_metadata.title is not None:
                        new_ksv.append(("title", len(new_ksv), yj_metadata.title))
                    if yj_metadata.cde_content_type is not None:
                        new_ksv.append(
                            (
                                "cde_content_type",
                                len(new_ksv),
                                yj_metadata.cde_content_type,
                            )
                        )
                    if yj_metadata.asin is not None:
                        new_ksv.append(("ASIN", len(new_ksv), yj_metadata.asin))
                        new_ksv.append(("content_id", len(new_ksv), yj_metadata.asin))
                    if cover_image is not None:
                        new_ksv.append(("cover_image", len(new_ksv), cover_image))
                    if yj_metadata.description is not None:
                        new_ksv.append(
                            ("description", len(new_ksv), yj_metadata.description)
                        )
                    if yj_metadata.issue_date is not None:
                        new_ksv.append(
                            ("issue_date", len(new_ksv), yj_metadata.issue_date)
                        )
                    if yj_metadata.language is not None:
                        new_ksv.append(("language", len(new_ksv), yj_metadata.language))
                    if yj_metadata.publisher is not None:
                        new_ksv.append(
                            ("publisher", len(new_ksv), yj_metadata.publisher)
                        )
                    cm[IS("$258")] = [
                        IonStruct(IS("$492"), k, IS("$307"), v)
                        for k, s, v in sorted(new_ksv)
                    ]
        if metadata_fragment is not None:
            mdx = metadata_fragment.value
            # Skip an effectively empty legacy metadata struct ($169 only).
            if not (len(mdx) == 0 or (len(mdx) == 1 and "$169" in mdx)):
                if authors is not None:
                    mdx[IS("$222")] = " & ".join(authors)
                else:
                    mdx.pop("$222", None)
                if yj_metadata.title is not None:
                    mdx[IS("$153")] = yj_metadata.title
                else:
                    mdx.pop("$153", None)
                if yj_metadata.cde_content_type is not None:
                    mdx[IS("$251")] = yj_metadata.cde_content_type
                else:
                    mdx.pop("$251", None)
                if yj_metadata.asin is not None:
                    mdx[IS("$224")] = yj_metadata.asin
                else:
                    mdx.pop("$224", None)
                if cover_image is not None:
                    mdx[IS("$424")] = IS(cover_image)
                else:
                    mdx.pop("$424", None)
                if yj_metadata.description is not None:
                    mdx[IS("$154")] = yj_metadata.description
                else:
                    mdx.pop("$154", None)
                if yj_metadata.issue_date is not None:
                    mdx[IS("$219")] = yj_metadata.issue_date
                else:
                    mdx.pop("$219", None)
                if yj_metadata.language is not None:
                    mdx[IS("$10")] = yj_metadata.language
                else:
                    mdx.pop("$10", None)
                if yj_metadata.publisher is not None:
                    mdx[IS("$232")] = yj_metadata.publisher
                else:
                    mdx.pop("$232", None)
def has_metadata(self):
return (
self.fragments.get(YJFragmentKey(ftype="$490")) is not None
or self.fragments.get(YJFragmentKey(ftype="$258")) is not None
)
def has_cover_data(self):
return self.get_cover_image_data() is not None
def get_asset_id(self):
return self.get_metadata_value("asset_id")
@property
def cde_type(self):
if not hasattr(self, "_cached_cde_type"):
self._cached_cde_type = self.get_metadata_value("cde_content_type")
return self._cached_cde_type
@property
def is_magazine(self):
return self.cde_type == "MAGZ"
@property
def is_sample(self):
return self.cde_type == "EBSP"
@property
def is_print_replica(self):
if not hasattr(self, "_cached_is_print_replica"):
self._cached_is_print_replica = (
self.get_metadata_value(
"yj_textbook", category="kindle_capability_metadata"
)
is not None
)
return self._cached_is_print_replica
@property
def is_fixed_layout(self):
if not hasattr(self, "_cached_is_fixed_layout"):
self._cached_is_fixed_layout = (
self.get_metadata_value("yj_fixed_layout", "kindle_capability_metadata")
is not None
)
return self._cached_is_fixed_layout
@property
def is_illustrated_layout(self):
if not hasattr(self, "_cached_is_illustrated_layout"):
self._cached_is_illustrated_layout = (
self.get_feature_value("yj.illustrated_layout") is not None
)
return self._cached_is_illustrated_layout
@property
def is_conditional_structure(self):
if not hasattr(self, "_cached_is_conditional_structure"):
self._cached_is_conditional_structure = self.get_feature_value(
"yj.conditional_structure"
) is not None or (
self.get_feature_value("reflow-style", default=0) == 5
and not self.is_magazine
)
return self._cached_is_conditional_structure
@property
def is_kfx_v1(self):
if not hasattr(self, "_cached_is_kfx_v1"):
fragment = self.fragments.get("$270", first=True)
self._cached_is_kfx_v1 = (
fragment.value.get("version", 0) == 1 if fragment is not None else False
)
return self._cached_is_kfx_v1
@property
def has_pdf_resource(self):
if not hasattr(self, "_cached_has_pdf_resource"):
for fragment in self.fragments.get_all("$164"):
if fragment.value.get("$161") == "$565":
self._cached_has_pdf_resource = True
break
else:
self._cached_has_pdf_resource = False
return self._cached_has_pdf_resource
def get_metadata_value(self, name, category="kindle_title_metadata", default=None):
try:
fragment = self.fragments.get("$490")
if fragment is not None:
for cm in fragment.value["$491"]:
if cm["$495"] == category:
for kv in cm["$258"]:
if kv["$492"] == name:
return kv["$307"]
metadata_symbol = METADATA_SYMBOLS.get(name)
if metadata_symbol is not None:
fragment = self.fragments.get("$258")
if fragment is not None and metadata_symbol in fragment.value:
return fragment.value[metadata_symbol]
except Exception:
pass
return default
    def get_feature_value(
        self, feature, namespace="com.amazon.yjconversion", default=None
    ):
        """Look up a feature version.

        For namespace "format_capabilities" the $593 fragment is consulted;
        otherwise the $585 content-features fragment is searched for the
        (namespace, feature) pair. Returns the major version alone when the
        minor version is 0, a (major, minor) tuple otherwise, or default
        when the feature is absent.
        """
        if namespace == "format_capabilities":
            fragment = self.fragments.get("$593", first=True)
            if fragment is not None:
                for fc in fragment.value:
                    if fc.get("$492", "") == feature:
                        return fc.get("version", "")
        else:
            fragment = self.fragments.get("$585", first=True)
            if fragment is not None:
                for cf in fragment.value.get("$590", []):
                    if (
                        cf.get("$586", "") == namespace
                        and cf.get("$492", "") == feature
                    ):
                        vi = cf.get("$589", {}).get("version", {})
                        major_version = vi.get("$587", 0)
                        minor_version = vi.get("$588", 0)
                        return (
                            major_version
                            if minor_version == 0
                            else (major_version, minor_version)
                        )
        return default
    def get_generators(self):
        """Return the set of (tool_name, version) pairs recorded in the
        container-info ($270) fragments; placeholder versions are reported
        as empty strings."""
        generators = set()
        for fragment in self.fragments.get_all("$270"):
            if "version" in fragment.value:
                package_version = fragment.value.get("$588", "")
                generators.add(
                    (
                        fragment.value.get("$587", ""),
                        package_version
                        if package_version not in PACKAGE_VERSION_PLACEHOLDERS
                        else "",
                    )
                )
        return generators
    def get_features(self):
        """Return the set of (namespace, key, version) feature triples:
        the maximum local symbol id, the format capabilities ($593), and
        the declared content features ($585)."""
        features = set()
        # Highest local symbol id in use, reported as a pseudo-feature.
        features.add(("symbols", "max_id", self.symtab.local_min_id - 1))
        for fragment in self.fragments.get_all("$593"):
            for fc in fragment.value:
                features.add(
                    ("format_capabilities", fc.get("$492", ""), fc.get("version", ""))
                )
        fragment = self.fragments.get("$585", first=True)
        if fragment is not None:
            for cf in fragment.value.get("$590", []):
                vi = cf.get("$589", {}).get("version", {})
                major_version = vi.get("$587", 0)
                minor_version = vi.get("$588", 0)
                features.add(
                    (
                        cf.get("$586", ""),
                        cf.get("$492", ""),
                        major_version
                        if minor_version == 0
                        else (major_version, minor_version),
                    )
                )
        return features
    def report_features_and_metadata(self, unknown_only=False):
        """Log the book's generators, features and metadata.

        Known values are logged at info level (suppressed when unknown_only
        is set); unrecognized generators/features/metadata are logged as
        warnings or errors so new variants surface during processing.
        """
        report_generators = set()
        for generator in sorted(self.get_generators()):
            generator_version = ("%s/%s" % generator) if generator[1] else generator[0]
            if not is_known_generator(generator[0], generator[1]):
                log.warning("Unknown kfxgen: %s" % generator_version)
            elif not unknown_only:
                report_generators.add(generator_version)
        if report_generators:
            log.info("kfxgen version: %s" % list_symbols(report_generators))
        report_features = set()
        for namespace, key, value in sorted(self.get_features()):
            value_str = (
                quote_name(value)
                if isinstance(value, str)
                else (
                    ".".join([str(v) for v in value])
                    if isinstance(value, tuple)
                    else str(value)
                )
            )
            if is_known_feature(namespace, key, value):
                if not unknown_only:
                    report_features.add("%s-%s" % (key, value_str))
            elif namespace == "symbols":
                # An unexpected max symbol id is only worth a warning.
                log.warning(
                    "Unknown %s feature: %s-%s" % (namespace, key, str(value_str))
                )
            else:
                log.error(
                    "Unknown %s feature: %s-%s" % (namespace, key, str(value_str))
                )
        if report_features:
            log.info("Features: %s" % list_symbols(report_features))
        # Collect (key, category, sequence, value) metadata entries from the
        # $490 fragment, the legacy $258 fragment, and the page navigation.
        metadata = []
        fragment = self.fragments.get("$490", first=True)
        if fragment is not None:
            for cm in fragment.value.get("$491", {}):
                category = cm.get("$495", "")
                for kv in cm.get("$258", []):
                    metadata.append(
                        (
                            kv.get("$492", ""),
                            category,
                            len(metadata),
                            kv.get("$307", ""),
                        )
                    )
        fragment = self.fragments.get("$258", first=True)
        if fragment is not None:
            for name, val in fragment.value.items():
                name = METADATA_NAMES.get(name, name.tostring())
                if name == "reading_orders":
                    val = len(val)
                metadata.append((name, "metadata", len(metadata), val))
        fragment = self.fragments.get("$389")
        if fragment is not None:
            for book_navigation in fragment.value:
                for nav_container in book_navigation.get("$392", []):
                    if ion_type(nav_container) is IonSymbol:
                        nav_container = self.fragments.get(
                            ftype="$391", fid=nav_container
                        )
                    if nav_container is not None:
                        nav_container = unannotated(nav_container)
                        if nav_container.get("$235", None) == "$237":
                            num_pages = len(nav_container.get("$247", []))
                            if num_pages:
                                metadata.append(
                                    (
                                        "pages",
                                        "book_navigation",
                                        len(metadata),
                                        num_pages,
                                    )
                                )
        report_metadata = []
        for key, cat, seq, val in sorted(metadata):
            if not is_known_metadata(cat, key, val):
                log.warning("Unknown %s: %s=%s" % (cat, key, str(val)))
            elif not unknown_only:
                if key == "cover_image":
                    # Report the cover's dimensions and format instead of the
                    # resource name; "???" when the resource is unreadable.
                    try:
                        cover_resource = self.fragments[
                            YJFragmentKey(ftype="$164", fid=val)
                        ].value
                        cover_raw_data = None
                        if "$165" in cover_resource:
                            cover_raw_media = self.fragments.get(
                                ftype="$417", fid=cover_resource["$165"]
                            )
                            if cover_raw_media is not None:
                                cover_raw_data = cover_raw_media.value.tobytes()
                        resource_height = cover_resource.get("$423", 0)
                        resource_width = cover_resource.get("$422", 0)
                        if (
                            not (resource_width and resource_height)
                        ) and cover_raw_data is not None:
                            with disable_debug_log():
                                cover = Image.open(io.BytesIO(cover_raw_data))
                                resource_width, resource_height = cover.size
                                cover.close()
                        val = "%dx%d" % (resource_width, resource_height)
                        cover_format = SYMBOL_FORMATS.get(
                            cover_resource["$161"], "unknown"
                        )
                        if cover_raw_data is not None:
                            cover_format = jpeg_type(cover_raw_data, cover_format)
                        if cover_format != "JPEG":
                            val += "-" + cover_format
                    except Exception:
                        val = "???"
                elif key == "dictionary_lookup":
                    val = "%s-to-%s" % (val.get("$474", "?"), val.get("$163", "?"))
                elif key == "description" and len(val) > 20:
                    val = "..."
                meta_str = "%s=%s" % (key, quote_name(str(val)))
                if meta_str not in report_metadata:
                    report_metadata.append(meta_str)
        if report_metadata:
            log.info("Metadata: %s" % list_symbols_unsorted(report_metadata))
    def get_cover_image_data(self):
        """Return the cover as a (format, bytes) tuple, or None when the
        book has no cover resource or its raw media is missing. "jpg" is
        normalized to "jpeg" in the returned format."""
        cover_image_resource = self.get_metadata_value("cover_image")
        if not cover_image_resource:
            return None
        cover_resource = self.fragments.get(ftype="$164", fid=cover_image_resource)
        if cover_resource is None:
            return None
        cover_fmt = cover_resource.value["$161"]
        if ion_type(cover_fmt) is IonSymbol:
            # Format stored as an ion symbol; map it to its name.
            cover_fmt = SYMBOL_FORMATS[cover_fmt]
        cover_raw_media = self.fragments.get(
            ftype="$417", fid=cover_resource.value["$165"]
        )
        if cover_raw_media is None:
            return None
        return (
            "jpeg" if cover_fmt == "jpg" else cover_fmt,
            cover_raw_media.value.tobytes(),
        )
    def fix_cover_image_data(self, cover_image_data):
        """Re-encode a JPEG cover to JPEG/JFIF when needed.

        cover_image_data is a (format, bytes) tuple. Kindle lockscreen
        display requires a JFIF header (bytes FF D8 FF E0); other JPEG
        variants are re-saved via PIL at quality 90. On failure the
        original bytes are kept and an error is logged.
        """
        fmt = cover_image_data[0]
        data = orig_data = cover_image_data[1]
        if fmt.lower() in ["jpg", "jpeg"] and not data.startswith(b"\xff\xd8\xff\xe0"):
            try:
                with disable_debug_log():
                    cover = Image.open(io.BytesIO(data))
                    outfile = io.BytesIO()
                    cover.save(outfile, "jpeg", quality=90)
                    cover.close()
                data = outfile.getvalue()
            except Exception:
                data = orig_data
            if data.startswith(b"\xff\xd8\xff\xe0"):
                log.info(
                    "Changed cover image from %s to JPEG/JFIF for Kindle lockscreen"
                    % jpeg_type(orig_data)
                )
            else:
                log.error(
                    "Failed to change cover image from %s to JPEG/JFIF"
                    % jpeg_type(orig_data)
                )
                data = orig_data
        return (fmt, data)
def set_cover_image_data(self, cover_image_data, update_cover_section=True):
fmt = cover_image_data[0].lower()
if fmt == "jpeg":
fmt = "jpg"
if fmt != "jpg":
raise Exception(
"Cannot set KFX cover image format to %s, must be JPEG" % fmt.upper()
)
cover_image = self.get_metadata_value("cover_image")
if cover_image is None:
cover_image = KFX_COVER_RESOURCE
cover_image_symbol = self.create_local_symbol(cover_image)
self.fragments.append(
YJFragment(
ftype="$164",
fid=cover_image_symbol,
value=IonStruct(IS("$175"), cover_image_symbol),
)
)
data = cover_image_data[1]
cover_resource = self.update_image_resource_and_media(
cover_image, data, fmt, update_cover_section
)
if "$214" in cover_resource:
with disable_debug_log():
cover_thumbnail = Image.open(io.BytesIO(data))
cover_thumbnail.thumbnail((512, 512), Image.ANTIALIAS)
outfile = io.BytesIO()
cover_thumbnail.save(
outfile, "jpeg" if fmt == "jpg" else fmt, quality=90
)
cover_thumbnail.close()
thumbnail_data = outfile.getvalue()
thumbnail_resource = unannotated(cover_resource["$214"])
self.update_image_resource_and_media(
str(thumbnail_resource), thumbnail_data, fmt
)
return cover_image
    def update_image_resource_and_media(
        self, resource_name, data, fmt, update_cover_section=False
    ):
        """Store new image bytes and format into an existing $164 resource.

        Updates the resource's format, mime type and dimensions, replaces
        (or creates) the raw-media ($417) fragment, and — when
        update_cover_section is set and the dimensions changed — patches the
        first section's page template so the cover page keeps its size.
        Returns the updated resource value.
        """
        cover_resource = self.fragments.get(ftype="$164", fid=resource_name).value
        cover_resource[IS("$161")] = IS(FORMAT_SYMBOLS[fmt])
        cover_resource[IS("$162")] = "image/" + fmt
        # Drop tile/margin fields that no longer match the new image.
        cover_resource.pop("$56", None)
        cover_resource.pop("$57", None)
        cover_resource.pop("$66", None)
        cover_resource.pop("$67", None)
        cover = Image.open(io.BytesIO(data))
        width, height = cover.size
        cover.close()
        orig_width = cover_resource.get("$422", 0)
        orig_height = cover_resource.get("$423", 0)
        cover_resource[IS("$422")] = width
        cover_resource[IS("$423")] = height
        if "$165" in cover_resource:
            # Resource already points at raw media: replace its bytes.
            self.fragments[
                YJFragmentKey(ftype="$417", fid=cover_resource["$165"])
            ].value = IonBLOB(data)
        else:
            # Create a new raw-media fragment and link it from the resource.
            location = "%s.%s" % (resource_name, fmt)
            cover_resource[IS("$165")] = location
            self.fragments.append(
                YJFragment(
                    ftype="$417",
                    fid=self.create_local_symbol(location),
                    value=IonBLOB(data),
                )
            )
        if update_cover_section and (width != orig_width or height != orig_height):
            section_updated = False
            if self.locate_cover_image_resource_from_content() == resource_name:
                section_names = self.ordered_section_names()
                if len(section_names) > 0:
                    cover_section = self.fragments.get(
                        ftype="$260", fid=section_names[0]
                    ).value
                    page_templates = cover_section["$141"]
                    page_template = (
                        page_templates[0] if len(page_templates) == 1 else {}
                    )
                    # Only patch a template whose layout and dimensions match
                    # the previous cover exactly.
                    if (
                        page_template.get("$159") == "$270"
                        and page_template.get("$156") == "$326"
                        and page_template.get("$140") == "$320"
                        and page_template.get("$66", -1) == orig_width
                        and page_template.get("$67", -1) == orig_height
                    ):
                        page_template[IS("$66")] = width
                        page_template[IS("$67")] = height
                        section_updated = True
            if not section_updated:
                log.info("First page image dimensions were not updated")
        return cover_resource
    def locate_cover_image_resource_from_content(self, replace_pdf=False):
        """Find the image resource shown on the book's first section.

        Walks the first section's page template into its story and scans
        the content tree for the first image reference. Returns the
        resource name, or None when no image is found. When the image is a
        PDF page: returns None unless replace_pdf is set, in which case the
        page is converted to JPEG and installed as the cover.
        """
        section_names = self.ordered_section_names()
        if not section_names:
            return None
        cover_section = self.fragments.get(ftype="$260", fid=section_names[0]).value
        for page_template in cover_section["$141"]:
            story_name = page_template.get("$176")
            if story_name:
                break
        else:
            # No page template references a story.
            return None
        cover_story = self.fragments.get(ftype="$259", fid=story_name).value
        def scan_content_for_image(content):
            # Depth-first search for the first image content node ($271).
            if content.get("$159") == "$271" and "$175" in content:
                return content["$175"]
            for subcontent in content.get("$146", {}):
                img = scan_content_for_image(subcontent)
                if img is not None:
                    return img
            return None
        resource_name = scan_content_for_image(cover_story)
        if resource_name is None:
            return None
        cover_resource = self.fragments.get(ftype="$164", fid=resource_name).value
        if cover_resource[IS("$161")] != "$565":
            # Not a PDF resource: usable as-is.
            return resource_name
        if not replace_pdf:
            return None
        location = cover_resource["$165"]
        raw_media = self.fragments[YJFragmentKey(ftype="$417", fid=location)].value
        page_num = cover_resource.get("$564", 0) + 1
        try:
            jpeg_data = convert_pdf_to_jpeg(raw_media, page_num)
        except Exception as e:
            log.error(
                "Exception during conversion of PDF '%s' page %d to JPEG: %s"
                % (location, page_num, repr(e))
            )
            return None
        return self.set_cover_image_data(
            ("jpeg", jpeg_data), update_cover_section=False
        )
def author_sort_name(author):
    """Convert "First [Middle] Last [Suffix]" into "Last [Suffix], First".

    Single-word names and names that already contain a comma are returned
    unchanged. A recognized generational or degree suffix (Jr, III, PhD,
    ...) stays attached to the surname.
    """
    suffixes = {
        "phd", "md", "ba", "ma", "dds", "msts",
        "sr", "senior", "jr", "junior",
        "ii", "iii", "iv",
    }
    parts = author.split()
    if len(parts) < 2:
        return author
    if len(parts) > 2 and parts[-1].replace(".", "").lower() in suffixes:
        surname = parts[-2][:-1] if parts[-2].endswith(",") else parts[-2]
        parts = parts[:-2] + ["%s %s" % (surname, parts[-1])]
    if "," in "".join(parts):
        return author
    return parts[-1] + ", " + " ".join(parts[:-1])
def unsort_author_name(author):
    """Convert "Last, First" back to "First Last"; other forms pass through."""
    if ", " in author:
        last, first = author.split(", ", 1)
        author = "%s %s" % (first, last)
    return author

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,852 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
class IonSharedSymbolTable(object):
    """A named, versioned list of Ion symbol texts shared between documents.

    Attributes:
        name: table name (e.g. "$ion", "YJ_symbols").
        version: table version number.
        symbols: list of symbol text values, in symbol-id order.
    """
    def __init__(self, name, version=1, symbols=None):
        self.name = name
        self.version = version
        # Fresh list per instance: the previous mutable default ([]) was a
        # single shared object, so mutating one table's symbols would have
        # leaked into every other table created without explicit symbols.
        self.symbols = [] if symbols is None else symbols
# The Ion system symbol table ("$ion", version 1): the nine symbols with
# fixed ids 1-9 that every Ion document implicitly imports.
SYSTEM_SYMBOL_TABLE = IonSharedSymbolTable(
    name="$ion",
    version=1,
    symbols=[
        "$ion",
        "$ion_1_0",
        "$ion_symbol_table",
        "name",
        "version",
        "imports",
        "symbols",
        "max_id",
        "$ion_shared_symbol_table",
    ],
)
YJ_SYMBOLS = IonSharedSymbolTable(
name="YJ_symbols",
version=10,
symbols=[
"$10",
"$11",
"$12",
"$13",
"$14?",
"$15",
"$16",
"$17?",
"$18?",
"$19",
"$20?",
"$21",
"$22?",
"$23",
"$24",
"$25?",
"$26?",
"$27",
"$28?",
"$29?",
"$30?",
"$31",
"$32",
"$33",
"$34",
"$35",
"$36",
"$37?",
"$38?",
"$39?",
"$40?",
"$41",
"$42",
"$43?",
"$44",
"$45",
"$46",
"$47",
"$48",
"$49",
"$50",
"$51",
"$52",
"$53",
"$54",
"$55",
"$56",
"$57",
"$58",
"$59",
"$60",
"$61",
"$62",
"$63",
"$64",
"$65",
"$66",
"$67",
"$68",
"$69",
"$70",
"$71?",
"$72",
"$73",
"$74?",
"$75",
"$76",
"$77",
"$78?",
"$79?",
"$80?",
"$81?",
"$82?",
"$83",
"$84",
"$85",
"$86",
"$87",
"$88",
"$89",
"$90",
"$91",
"$92",
"$93",
"$94",
"$95",
"$96",
"$97",
"$98",
"$99",
"$100",
"$101?",
"$102",
"$103?",
"$104",
"$105",
"$106",
"$107",
"$108",
"$109?",
"$110?",
"$111?",
"$112",
"$113?",
"$114?",
"$115?",
"$116?",
"$117?",
"$118",
"$119?",
"$120?",
"$121?",
"$122?",
"$123?",
"$124?",
"$125",
"$126",
"$127",
"$128?",
"$129?",
"$130?",
"$131",
"$132",
"$133",
"$134",
"$135",
"$136",
"$137",
"$138?",
"$139?",
"$140",
"$141",
"$142",
"$143",
"$144",
"$145",
"$146",
"$147?",
"$148",
"$149",
"$150",
"$151",
"$152",
"$153",
"$154",
"$155",
"$156",
"$157",
"$158?",
"$159",
"$160?",
"$161",
"$162",
"$163",
"$164",
"$165",
"$166",
"$167",
"$168?",
"$169",
"$170",
"$171",
"$172?",
"$173",
"$174",
"$175",
"$176",
"$177?",
"$178",
"$179",
"$180",
"$181",
"$182",
"$183",
"$184",
"$185",
"$186",
"$187?",
"$188?",
"$189?",
"$190?",
"$191?",
"$192",
"$193?",
"$194?",
"$195?",
"$196?",
"$197?",
"$198?",
"$199",
"$200",
"$201",
"$202",
"$203",
"$204?",
"$205",
"$206",
"$207",
"$208",
"$209",
"$210",
"$211",
"$212",
"$213",
"$214",
"$215",
"$216",
"$217",
"$218",
"$219",
"$220",
"$221?",
"$222",
"$223?",
"$224",
"$225?",
"$226?",
"$227?",
"$228?",
"$229?",
"$230",
"$231",
"$232",
"$233",
"$234?",
"$235",
"$236",
"$237",
"$238",
"$239",
"$240",
"$241",
"$242?",
"$243?",
"$244",
"$245",
"$246",
"$247",
"$248",
"$249",
"$250",
"$251",
"$252",
"$253",
"$254",
"$255",
"$256?",
"$257?",
"$258",
"$259",
"$260",
"$261?",
"$262",
"$263?",
"$264",
"$265",
"$266",
"$267",
"$268?",
"$269",
"$270",
"$271",
"$272",
"$273",
"$274",
"$275?",
"$276",
"$277",
"$278",
"$279",
"$280?",
"$281",
"$282",
"$283",
"$284",
"$285",
"$286",
"$287",
"$288?",
"$289?",
"$290?",
"$291?",
"$292",
"$293",
"$294",
"$295?",
"$296",
"$297?",
"$298",
"$299",
"$300?",
"$301?",
"$302?",
"$303?",
"$304",
"$305",
"$306",
"$307",
"$308",
"$309?",
"$310",
"$311",
"$312",
"$313?",
"$314",
"$315?",
"$316?",
"$317?",
"$318",
"$319",
"$320",
"$321",
"$322",
"$323",
"$324",
"$325",
"$326",
"$327?",
"$328",
"$329",
"$330",
"$331",
"$332?",
"$333?",
"$334",
"$335",
"$336",
"$337",
"$338?",
"$339?",
"$340",
"$341",
"$342",
"$343",
"$344",
"$345",
"$346",
"$347",
"$348",
"$349",
"$350",
"$351",
"$352",
"$353",
"$354?",
"$355",
"$356",
"$357",
"$358?",
"$359",
"$360",
"$361",
"$362",
"$363",
"$364?",
"$365?",
"$366?",
"$367?",
"$368?",
"$369",
"$370",
"$371",
"$372",
"$373",
"$374",
"$375",
"$376",
"$377",
"$378",
"$379",
"$380?",
"$381",
"$382",
"$383",
"$384",
"$385",
"$386",
"$387",
"$388?",
"$389",
"$390",
"$391",
"$392",
"$393",
"$394",
"$395",
"$396",
"$397?",
"$398?",
"$399?",
"$400?",
"$401?",
"$402?",
"$403",
"$404?",
"$405?",
"$406?",
"$407?",
"$408?",
"$409",
"$410",
"$411",
"$412",
"$413",
"$414",
"$415",
"$416",
"$417",
"$418",
"$419",
"$420?",
"$421",
"$422",
"$423",
"$424",
"$425?",
"$426",
"$427",
"$428",
"$429",
"$430?",
"$431?",
"$432",
"$433",
"$434",
"$435?",
"$436",
"$437",
"$438",
"$439",
"$440?",
"$441",
"$442",
"$443?",
"$444?",
"$445?",
"$446?",
"$447",
"$448?",
"$449",
"$450?",
"$451?",
"$452?",
"$453",
"$454",
"$455",
"$456",
"$457",
"$458?",
"$459",
"$460",
"$461",
"$462",
"$463?",
"$464",
"$465",
"$466",
"$467?",
"$468",
"$469?",
"$470?",
"$471?",
"$472",
"$473?",
"$474",
"$475",
"$476",
"$477",
"$478",
"$479",
"$480",
"$481",
"$482",
"$483",
"$484",
"$485",
"$486",
"$487",
"$488",
"$489",
"$490",
"$491",
"$492",
"$493?",
"$494?",
"$495",
"$496",
"$497",
"$498",
"$499",
"$500",
"$501",
"$502",
"$503",
"$504?",
"$505",
"$506?",
"$507?",
"$508?",
"$509",
"$510?",
"$511?",
"$512?",
"$513?",
"$514?",
"$515?",
"$516?",
"$517?",
"$518?",
"$519?",
"$520?",
"$521?",
"$522?",
"$523?",
"$524?",
"$525",
"$526",
"$527?",
"$528",
"$529?",
"$530?",
"$531?",
"$532?",
"$533?",
"$534?",
"$535?",
"$536?",
"$537?",
"$538",
"$539?",
"$540?",
"$541?",
"$542?",
"$543?",
"$544?",
"$545?",
"$546",
"$547",
"$548",
"$549",
"$550",
"$551",
"$552",
"$553",
"$554",
"$555?",
"$556?",
"$557",
"$558",
"$559",
"$560",
"$561?",
"$562?",
"$563?",
"$564",
"$565",
"$566?",
"$567?",
"$568?",
"$569",
"$570",
"$571?",
"$572?",
"$573",
"$574?",
"$575?",
"$576",
"$577",
"$578?",
"$579?",
"$580",
"$581",
"$582?",
"$583",
"$584",
"$585",
"$586",
"$587",
"$588",
"$589",
"$590",
"$591",
"$592",
"$593",
"$594",
"$595",
"$596",
"$597",
"$598",
"$599?",
"$600?",
"$601",
"$602",
"$603?",
"$604",
"$605",
"$606",
"$607?",
"$608",
"$609",
"$610",
"$611",
"$612?",
"$613",
"$614",
"$615",
"$616",
"$617",
"$618",
"$619",
"$620?",
"$621",
"$622",
"$623",
"$624?",
"$625",
"$626?",
"$627?",
"$628",
"$629",
"$630",
"$631?",
"$632",
"$633",
"$634?",
"$635",
"$636",
"$637",
"$638",
"$639",
"$640",
"$641",
"$642",
"$643",
"$644",
"$645",
"$646",
"$647",
"$648",
"$649",
"$650",
"$651?",
"$652",
"$653?",
"$654?",
"$655",
"$656",
"$657",
"$658",
"$659",
"$660",
"$661?",
"$662?",
"$663",
"$664",
"$665",
"$666",
"$667?",
"$668",
"$669?",
"$670?",
"$671",
"$672",
"$673",
"$674",
"$675",
"$676",
"$677",
"$678",
"$679",
"$680",
"$681",
"$682",
"$683",
"$684",
"$685?",
"$686",
"$687",
"$688",
"$689",
"$690",
"$691?",
"$692",
"$693",
"$694?",
"$695?",
"$696",
"$697",
"$698",
"$699?",
"$700",
"$701",
"$702",
"$703",
"$704",
"$705",
"$706",
"$707",
"$708",
"$709?",
"$710?",
"$711?",
"$712?",
"$713?",
"$714?",
"$715?",
"$716?",
"$717",
"$718",
"$719",
"$720",
"$721?",
"$722?",
"$723?",
"$724",
"$725",
"$726",
"$727",
"$728",
"$729",
"$730",
"$731",
"$732",
"$733",
"$734",
"$735",
"$736",
"$737?",
"$738?",
"$739?",
"$740?",
"$741",
"$742",
"$743?",
"$744?",
"$745?",
"$746?",
"$747?",
"$748?",
"$749",
"$750",
"$751",
"$752",
"$753",
"$754",
"$755",
"$756",
"$757",
"$758",
"$759",
"$760",
"$761",
"$762",
"$763",
"$764?",
"$765",
"$766",
"$767?",
"$768?",
"$769?",
"$770?",
"$771?",
"$772?",
"$773",
"$774",
"$775?",
"$776?",
"$777?",
"$778",
"$779",
"$780",
"$781",
"$782",
"$783",
"$784",
"$785",
"$786",
"$787?",
"$788",
"$789",
"$790",
"$791",
"$792",
"$793",
"$794",
"$795",
"$796",
"$797",
"$798?",
"$799?",
"$800?",
"$801?",
"$802?",
"$803?",
"$804?",
"$805?",
"$806?",
"$807?",
"$808?",
"$809?",
"$810?",
"$811?",
"$812?",
"$813?",
"$814?",
"$815?",
"$816?",
"$817?",
"$818?",
"$819?",
"$820?",
"$821",
"$822",
"$823",
"$824",
"$825",
],
)

View File

@@ -0,0 +1,457 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import copy
import decimal
import re
from .epub_output import EPUB_Output
from .ion import (
IonAnnotation,
IonList,
IonSExp,
IonString,
IonStruct,
IonSymbol,
ion_type,
)
from .message_logging import log
from .python_transition import IS_PYTHON2
from .utilities import UUID_MATCH_RE, check_empty, list_symbols, truncate_list
from .yj_structure import SYM_TYPE
from .yj_to_epub_content import KFX_EPUB_Content
from .yj_to_epub_metadata import KFX_EPUB_Metadata
from .yj_to_epub_misc import KFX_EPUB_Misc
from .yj_to_epub_navigation import KFX_EPUB_Navigation
from .yj_to_epub_properties import GENERIC_FONT_NAMES, KFX_EPUB_Properties
from .yj_to_epub_resources import KFX_EPUB_Resources
if IS_PYTHON2:
from .python_transition import str
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"

# Report referenced-but-missing font families as warnings (True) or info (False).
REPORT_MISSING_FONTS = True
# Keep a deep copy of each consumed fragment so duplicate use can be tolerated (debug aid).
RETAIN_USED_FRAGMENTS = False
# Also process external resources that are never referenced by the content (debug aid).
RETAIN_UNUSED_RESOURCES = False
# Maps a fragment-type symbol to the symbol of the field that holds that
# fragment's own name. The "$nnn" strings are YJ/KFX symbol-table ids —
# presumably defined in yj_structure; TODO confirm individual meanings.
FRAGMENT_NAME_SYMBOL = {
    "$266": "$180",
    "$164": "$175",
    "$391": "$239",
    "$393": "$240",
    "$260": "$174",
    "$608": "$598",
    "$259": "$176",
    "$157": "$173",
}
class KFX_EPUB(
    KFX_EPUB_Content,
    KFX_EPUB_Metadata,
    KFX_EPUB_Misc,
    KFX_EPUB_Navigation,
    KFX_EPUB_Properties,
    KFX_EPUB_Resources,
):
    """Driver for converting a KFX book (a set of ion fragments) into an EPUB.

    Each mixin base class implements one facet of the conversion (content,
    metadata, navigation, styling, resources); this class owns the shared
    bookkeeping state and sequences the whole conversion in __init__.
    """

    DEBUG = False

    def __init__(self, book, epub2_desired=False):
        """Run the full conversion of *book*; call decompile_to_epub() afterwards.

        book -- parsed KFX book providing .fragments, .kpf_container, .is_kpf_prepub, ...
        epub2_desired -- request EPUB 2 output from EPUB_Output.
        """
        self.book = book
        self.book_symbols = set()
        self.book_data = self.organize_fragments_by_type(book.fragments)
        self.is_kpf = book.kpf_container is not None
        self.used_fragments = {}
        self.epub = EPUB_Output(epub2_desired)
        self.epub.get_anchor_uri = self.get_anchor_uri
        self.determine_book_symbol_format()
        decimal.getcontext().prec = 6

        # Page-label and error-reporting state.
        self.page_label_anchor_id = {}
        self.reported_duplicate_page_label = set()
        self.reported_pdf_errors = set()

        # Style and font bookkeeping.
        self.used_kfx_styles = set()
        self.missing_kfx_styles = set()
        self.css_rules = {}
        self.css_files = set()
        self.missing_special_classes = set()
        self.media_queries = collections.defaultdict(dict)
        self.font_names = set()
        self.missing_font_names = set()
        self.font_name_replacements = {}
        self.font_faces = []
        self.location_filenames = {}
        self.reported_characters = set()
        self.text_combine_in_use = False
        self.incorrect_font_quoting = set()
        for name in GENERIC_FONT_NAMES:
            self.fix_font_name(name, add=True, generic=True)

        # Navigation / anchor bookkeeping.
        self.nav_container_section = {}
        self.navto_anchor = {}
        self.toc_entry_count = 0
        self.anchor_uri = {}
        self.anchor_elem = {}
        self.anchor_id = {}
        self.anchor_ids = set()
        self.position_anchors = {}
        self.anchor_positions = {}
        self.used_anchors = set()
        self.immovable_anchors = set()
        self.page_anchor_id_label = {}
        self.fix_condition_href = False
        self.has_conditional_content = False
        self.context_ = []
        self.save_resources = True
        self.cde_content_type = ""
        self.resource_cache = {}
        self.used_raw_media = set()

        # Conversion sequence. The order matters: each step consumes the
        # fragments it handles from self.book_data.
        self.process_fonts()
        self.process_document_data()
        self.process_content_features()
        self.process_metadata()
        if self.epub.illustrated_layout:
            raise Exception("Illustrated layout (Kindle in Motion) is not supported.")
        self.set_condition_operators()
        self.process_anchors()
        self.process_navigation()
        for style_name, yj_properties in self.book_data.get("$157", {}).items():
            self.check_fragment_name(yj_properties, "$157", style_name, delete=False)
        self.process_reading_order()
        if self.cover_resource and not self.epub.html_cover:
            # Best effort: a broken cover resource must not abort the conversion.
            # (Narrowed from a bare except with a debug print.)
            try:
                self.process_external_resource(
                    self.cover_resource
                ).manifest_entry.is_cover_image = True
            except Exception as e:
                log.error(
                    "Failed to process cover resource %s: %s"
                    % (self.cover_resource, e)
                )
        self.fixup_anchors_and_hrefs()
        self.update_default_font_and_language()
        self.set_html_defaults()
        self.fixup_styles_and_classes()
        self.create_css_files()
        self.prepare_book_parts()
        if self.position_anchors:
            pos = []
            for anchor in self.position_anchors:
                for offset in self.position_anchors[anchor]:
                    pos.append("%s.%s" % (anchor, offset))
            log.error(
                "Failed to locate %d referenced positions: %s"
                % (len(pos), ", ".join(truncate_list(sorted(pos))))
            )
        if RETAIN_UNUSED_RESOURCES:
            for external_resource in self.book_data.get("$164", {}):
                self.process_external_resource(external_resource)
        self.check_empty(self.book_data.pop("$164", {}), "external_resource")
        self.report_duplicate_anchors()

        # Consume the remaining fragment categories so the final check_empty()
        # can flag anything unexpected that was left over.
        raw_media = self.book_data.pop("$417", {})
        for used_raw_media in self.used_raw_media:
            raw_media.pop(used_raw_media)
        self.check_empty(raw_media, "raw_media")
        self.check_empty(self.book_data.pop("$260", {}), "$260")
        storyline = self.book_data.pop("$259", {})
        if not self.book.is_kpf_prepub:
            self.check_empty(storyline, "$259")
        kfx_styles = self.book_data.pop("$157", {})
        for used_kfx_style in self.used_kfx_styles:
            kfx_styles.pop(used_kfx_style)
        self.check_empty(kfx_styles, "kfx styles")
        self.book_data.pop("$270", None)
        self.book_data.pop("$593", None)
        self.book_data.pop("$ion_symbol_table", None)
        self.book_data.pop("$419", None)
        self.book_data.pop("$145", None)
        self.book_data.pop("$608", None)
        self.book_data.pop("$692", None)
        self.book_data.pop("$756", None)
        self.book_data.pop("$550", None)
        self.book_data.pop("$265", None)
        self.book_data.pop("$264", None)
        if "$395" in self.book_data:
            resource_path = self.book_data.pop("$395")
            for ent in resource_path.pop("$247", []):
                ent.pop("$175", None)
                ent.pop("$166", None)
                self.check_empty(ent, "%s %s" % ("$395", "$247"))
            self.check_empty(resource_path, "$395")
        self.book_data.pop("$609", None)
        self.book_data.pop("$621", None)
        self.book_data.pop("$597", None)
        self.book_data.pop("$610", None)
        self.book_data.pop("$611", None)
        self.book_data.pop("$387", None)
        self.book_data.pop("$267", None)
        self.check_empty(self.book_data, "Book fragments")

        if self.missing_font_names:
            if REPORT_MISSING_FONTS:
                log.warning(
                    "Missing font family names: %s"
                    % list_symbols(self.missing_font_names)
                )
            else:
                log.info(
                    "Missing referenced font family names: %s"
                    % list_symbols(self.missing_font_names)
                )
        if self.font_names:
            log.info(
                "Present referenced font family names: %s"
                % list_symbols(self.font_names)
            )

    def decompile_to_epub(self):
        """Serialize the converted book and return the generated EPUB content."""
        return self.epub.generate_epub()

    def organize_fragments_by_type(self, fragment_list):
        """Group fragments into {ftype: {id: value}}, synthesizing unique ids.

        Container-info, font, and resource-chunk fragments do not have unique
        ids of their own, so composite ids are constructed for them. Singleton
        categories keyed by their own type symbol are collapsed to the value.
        """
        font_count = 0
        categorized_data = {}
        last_container_id = None
        for fragment in fragment_list:
            frag_id = fragment.fid
            self.book_symbols.add(frag_id)
            if fragment.ftype == "$270":
                # Container info: key by "<container id>:<chunk id>".
                frag_id = last_container_id = IonSymbol(
                    "%s:%s"
                    % (fragment.value.get("$161", ""), fragment.value.get("$409", ""))
                )
            elif fragment.ftype == "$593":
                frag_id = last_container_id
            elif fragment.ftype == "$262":
                # Fonts share one id; append a counter to keep them distinct.
                frag_id = IonSymbol("%s-font-%03d" % (frag_id, font_count))
                font_count += 1
            elif fragment.ftype == "$387":
                frag_id = IonSymbol("%s:%s" % (frag_id, fragment.value["$215"]))
            dt = categorized_data.setdefault(fragment.ftype, {})
            if frag_id not in dt:
                dt[frag_id] = self.replace_ion_data(fragment.value)
            else:
                log.error("Book contains multiple %s fragments" % str(fragment))
        for category, ids in categorized_data.items():
            if len(ids) == 1:
                frag_id = list(ids)[0]
                if frag_id == category:
                    categorized_data[category] = categorized_data[category][frag_id]
            elif None in ids:
                log.error(
                    "Fragment list contains mixed null/non-null ids of type '%s'"
                    % category
                )
        return categorized_data

    def determine_book_symbol_format(self):
        """Classify the book's symbol naming scheme by majority vote.

        Sets self.book_symbol_format to one of the SYM_TYPE values; a
        document_data "max_id" forces the SHORT format.
        """
        sym_type_counts = collections.defaultdict(lambda: 0)
        for book_symbol in self.book_symbols:
            symbol_type = self.book.classify_symbol(book_symbol)
            sym_type_counts[symbol_type] += 1
        sym_type_counts[SYM_TYPE.ORIGINAL] += sym_type_counts[SYM_TYPE.UNKNOWN] // 10
        symbol_quarum = (
            sym_type_counts[SYM_TYPE.DICTIONARY]
            + sym_type_counts[SYM_TYPE.SHORT]
            + sym_type_counts[SYM_TYPE.BASE64]
            + sym_type_counts[SYM_TYPE.ORIGINAL]
        ) // 2
        if sym_type_counts[
            SYM_TYPE.SHORT
        ] >= symbol_quarum or "max_id" in self.book_data.get("$538", {}):
            self.book_symbol_format = SYM_TYPE.SHORT
        elif sym_type_counts[SYM_TYPE.DICTIONARY] >= symbol_quarum:
            self.book_symbol_format = SYM_TYPE.DICTIONARY
        elif sym_type_counts[SYM_TYPE.BASE64] >= symbol_quarum:
            self.book_symbol_format = SYM_TYPE.BASE64
        else:
            self.book_symbol_format = SYM_TYPE.ORIGINAL
        if self.book_symbol_format != SYM_TYPE.SHORT:
            log.info("Book symbol format is %s" % self.book_symbol_format)

    def unique_part_of_local_symbol(self, symbol):
        """Strip the format-specific boilerplate prefix from a local symbol name."""
        name = str(symbol)
        if self.book_symbol_format == SYM_TYPE.SHORT:
            name = re.sub(r"^resource/", "", name, count=1)
        elif self.book_symbol_format == SYM_TYPE.DICTIONARY:
            name = re.sub(r"^G", "", name, count=1)
        elif self.book_symbol_format == SYM_TYPE.BASE64:
            name = re.sub(r"^(resource/)?[a-zA-Z0-9_-]{22}", "", name, count=1)
        else:
            name = re.sub(
                r"^V_[0-9]_[0-9](-PARA|-CHAR)?-[0-9]_[0-9]_[0-9a-f]{12,16}_[0-9a-f]{1,5}",
                "",
                name,
                count=1,
            )
            name = re.sub(
                r"^(fonts/|images/)?(res|resource)_[0-9]_[0-9]_[0-9a-f]{12,16}_[0-9a-f]{1,5}_",
                "",
                name,
                count=1,
            )
            name = re.sub(UUID_MATCH_RE, "", name, count=1)
        # Drop any leading separator characters left behind by the stripping.
        return name.lstrip("-_")

    def prefix_unique_part_of_symbol(self, unique_part, prefix):
        """Return *unique_part* prefixed when it is plain/ambiguous, else unchanged."""
        if not unique_part:
            return prefix
        if re.match("^[A-Za-z0-9]+(-.+)?$", unique_part) or not re.match(
            "^[A-Za-z]", unique_part
        ):
            return "%s_%s" % (prefix, unique_part)
        return unique_part

    def replace_ion_data(self, f):
        """Recursively copy ion data, unwrapping annotations and collecting symbols."""
        data_type = ion_type(f)
        if data_type is IonAnnotation:
            return self.replace_ion_data(f.value)
        if data_type is IonList:
            return [self.replace_ion_data(fc) for fc in f]
        if data_type is IonSExp:
            return IonSExp([self.replace_ion_data(fc) for fc in f])
        if data_type is IonStruct:
            newf = IonStruct()
            for fk, fv in f.items():
                newf[self.replace_ion_data(fk)] = self.replace_ion_data(fv)
            return newf
        if data_type is IonSymbol:
            self.book_symbols.add(f)
        # Scalars (and symbols) are returned unchanged.
        return f

    def get_fragment(self, ftype=None, fid=None, delete=True):
        """Fetch (and by default consume) the fragment of *ftype* named *fid*.

        Returns an empty IonStruct (after logging an error) when the fragment
        is missing or already consumed; RETAIN_USED_FRAGMENTS allows reuse.
        """
        if ion_type(fid) not in [IonString, IonSymbol]:
            return fid
        if ftype in self.book_data:
            fragment_container = self.book_data[ftype]
        elif ftype == "$393" and "$394" in self.book_data:
            # "$393" fragments may be stored under the alternate "$394" category.
            fragment_container = self.book_data["$394"]
        else:
            fragment_container = {}
        data = (
            fragment_container.pop(fid, None) if delete else fragment_container.get(fid)
        )
        if data is None:
            used_data = self.used_fragments.get((ftype, fid))
            if used_data is not None:
                if RETAIN_USED_FRAGMENTS:
                    log.warning(
                        "book fragment used multiple times: %s %s" % (ftype, fid)
                    )
                    data = used_data
                else:
                    log.error("book fragment used multiple times: %s %s" % (ftype, fid))
                    data = IonStruct()
            else:
                log.error("book is missing fragment: %s %s" % (ftype, fid))
                data = IonStruct()
        else:
            self.used_fragments[(ftype, fid)] = (
                copy.deepcopy(data) if RETAIN_USED_FRAGMENTS else True
            )
            data_name = self.get_fragment_name(data, ftype, delete=False)
            if data_name and data_name != fid:
                log.error("Expected %s named %s but found %s" % (ftype, fid, data_name))
        return data

    def get_named_fragment(self, structure, ftype=None, delete=True, name_symbol=None):
        """Fetch the fragment whose id is stored in *structure*'s name field."""
        return self.get_fragment(
            ftype=ftype,
            fid=structure.pop(name_symbol or FRAGMENT_NAME_SYMBOL[ftype]),
            delete=delete,
        )

    def get_location_id(self, structure):
        """Pop the location id ("$155", falling back to kfx id "$598") as str, or None."""
        loc_id = structure.pop("$155", None) or structure.pop("$598", None)
        if loc_id is not None:
            loc_id = str(loc_id)
        return loc_id

    def check_fragment_name(self, fragment_data, ftype, fid, delete=True):
        """Log an error when the fragment's embedded name differs from *fid*."""
        name = self.get_fragment_name(fragment_data, ftype, delete)
        if name != fid:
            log.error("Fragment %s %s has incorrect name %s" % (ftype, fid, name))

    def get_fragment_name(self, fragment_data, ftype, delete=True):
        """Return the fragment's embedded name using the per-type name symbol."""
        return self.get_structure_name(
            fragment_data, FRAGMENT_NAME_SYMBOL[ftype], delete
        )

    def get_structure_name(self, structure, name_key, delete=True):
        """Return structure[name_key], removing it when *delete* is True."""
        return (
            structure.pop(name_key, None) if delete else structure.get(name_key, None)
        )

    def check_empty(self, a_dict, dict_name):
        """Report any unprocessed keys remaining in *a_dict* (delegates to utility)."""
        check_empty(a_dict, dict_name)

    def fix_html_id(self, id):
        """Sanitize an id for use as an HTML/XML id attribute (delegates to epub)."""
        return self.epub.fix_html_id(id)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,279 @@
from __future__ import absolute_import, division, print_function, unicode_literals
from .message_logging import log
from .python_transition import IS_PYTHON2
from .yj_structure import METADATA_NAMES, SYM_TYPE
from .yj_to_epub_properties import (
DEFAULT_DOCUMENT_FONT_FAMILY,
DEFAULT_DOCUMENT_FONT_SIZE,
DEFAULT_DOCUMENT_LINE_HEIGHT,
DEFAULT_FONT_NAMES,
DEFAULT_KC_COMIC_FONT_SIZE,
)
if IS_PYTHON2:
from .python_transition import str
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"

# Maps YJ orientation-lock symbols to the orientation values assigned to
# EPUB_Output.orientation_lock ("portrait" / "landscape" / "none").
ORIENTATIONS = {
    "$385": "portrait",
    "$386": "landscape",
    "$349": "none",
}
class KFX_EPUB_Metadata(object):
    """Mixin handling document data and book metadata during KFX-to-EPUB conversion.

    Expects the driver class to provide self.book_data, self.epub,
    self.book_symbol_format, self.check_empty(), self.process_content_properties(),
    and the font bookkeeping attributes.
    """

    def process_document_data(self):
        """Consume the "$538" document-data fragment: orientation, fonts, writing mode.

        Unknown or unexpected values are logged; the fragment must be fully
        consumed (check_empty at the end flags leftovers).
        """
        document_data = self.book_data.pop("$538", {})
        if "$433" in document_data:
            orientation_lock_ = document_data.pop("$433")
            if orientation_lock_ in ORIENTATIONS:
                self.epub.orientation_lock = ORIENTATIONS[orientation_lock_]
            else:
                log.error("Unexpected orientation_lock: %s" % orientation_lock_)
                self.epub.orientation_lock = "none"
        else:
            self.epub.orientation_lock = "none"
        if "$436" in document_data:
            selection = document_data.pop("$436")
            if selection not in ["$442", "$441"]:
                log.error("Unexpected document selection: %s" % selection)
        if "$477" in document_data:
            spacing_percent_base = document_data.pop("$477")
            if spacing_percent_base != "$56":
                log.error(
                    "Unexpected document spacing_percent_base: %s"
                    % spacing_percent_base
                )
        if "$581" in document_data:
            pan_zoom = document_data.pop("$581")
            if pan_zoom != "$441":
                log.error("Unexpected document pan_zoom: %s" % pan_zoom)
        if "$665" in document_data:
            # Presence of a comic panel view mode implies a comic book.
            self.epub.set_book_type("comic")
            comic_panel_view_mode = document_data.pop("$665")
            if comic_panel_view_mode != "$666":
                log.error(
                    "Unexpected comic panel view mode: %s" % comic_panel_view_mode
                )
        if "$668" in document_data:
            auto_contrast = document_data.pop("$668")
            if auto_contrast != "$573":
                log.error("Unexpected auto_contrast: %s" % auto_contrast)
        document_data.pop("$597", None)
        if "max_id" in document_data:
            # max_id only appears with the SHORT symbol format.
            max_id = document_data.pop("max_id")
            if self.book_symbol_format != SYM_TYPE.SHORT:
                log.error(
                    "Unexpected document_data max_id=%s for %s symbol format"
                    % (max_id, self.book_symbol_format)
                )
        elif self.book_symbol_format == SYM_TYPE.SHORT:
            log.error(
                "Book has %s symbol format without document_data max_id"
                % self.book_symbol_format
            )
        # Authoring/semantic keys carry no EPUB-relevant information; discard.
        document_data.pop("yj.semantics.book_theme_metadata", None)
        document_data.pop("yj.semantics.containers_with_semantics", None)
        document_data.pop("yj.semantics.page_number_begin", None)
        document_data.pop("yj.print.settings", None)
        document_data.pop("yj.authoring.auto_panel_settings_auto_mask_color_flag", None)
        document_data.pop("yj.authoring.auto_panel_settings_mask_color", None)
        document_data.pop("yj.authoring.auto_panel_settings_opacity", None)
        document_data.pop("yj.authoring.auto_panel_settings_padding_bottom", None)
        document_data.pop("yj.authoring.auto_panel_settings_padding_left", None)
        document_data.pop("yj.authoring.auto_panel_settings_padding_right", None)
        document_data.pop("yj.authoring.auto_panel_settings_padding_top", None)
        self.reading_orders = document_data.pop("$169", [])
        self.font_name_replacements["default"] = DEFAULT_DOCUMENT_FONT_FAMILY
        doc_style = self.process_content_properties(document_data)
        column_count = doc_style.pop("column-count", "auto")
        if column_count != "auto":
            log.warning("Unexpected document column_count: %s" % column_count)
        self.epub.page_progression_direction = doc_style.pop("direction", "ltr")
        self.default_font_family = doc_style.pop(
            "font-family", DEFAULT_DOCUMENT_FONT_FAMILY
        )
        # NOTE(review): the inner loop overwrites, so each default name maps to
        # the LAST family in the document's font-family list — TODO confirm intent.
        for default_name in DEFAULT_FONT_NAMES:
            for font_family in self.split_font_family_value(self.default_font_family):
                self.font_name_replacements[default_name] = font_family
        self.default_font_size = doc_style.pop("font-size", DEFAULT_DOCUMENT_FONT_SIZE)
        if self.default_font_size not in [
            DEFAULT_DOCUMENT_FONT_SIZE,
            DEFAULT_KC_COMIC_FONT_SIZE,
        ]:
            log.warning("Unexpected document font-size: %s" % self.default_font_size)
        self.default_line_height = doc_style.pop(
            "line-height", DEFAULT_DOCUMENT_LINE_HEIGHT
        )
        if self.default_line_height != DEFAULT_DOCUMENT_LINE_HEIGHT:
            log.warning(
                "Unexpected document line-height: %s" % self.default_line_height
            )
        self.epub.writing_mode = doc_style.pop("writing-mode", "horizontal-tb")
        if self.epub.writing_mode not in [
            "horizontal-tb",
            "vertical-lr",
            "vertical-rl",
        ]:
            log.warning("Unexpected document writing-mode: %s" % self.epub.writing_mode)
        self.check_empty(doc_style.properties, "document data styles")
        self.check_empty(document_data, "$538")

    def process_content_features(self):
        """Validate and discard the "$585" content-features fragment."""
        content_features = self.book_data.pop("$585", {})
        for feature in content_features.pop("$590", []):
            key = "%s/%s" % (feature.pop("$586", ""), feature.pop("$492", ""))
            version_info = feature.pop("$589", {})
            version = version_info.pop("version", {})
            version.pop("$587", "")
            version.pop("$588", "")
            self.check_empty(version_info, "content_features %s version_info" % key)
            self.check_empty(feature, "content_features %s feature" % key)
        # NOTE: both pops evaluate eagerly, so "$155" is consumed even when
        # "$598" is present (matching the original behavior).
        if content_features.pop("$598", content_features.pop("$155", "$585")) != "$585":
            log.error("content_features id/kfx_id is incorrect")
        self.check_empty(content_features, "$585")

    def process_metadata(self):
        """Apply metadata from the categorised ("$490") and flat ("$258") tables."""
        self.cover_resource = None
        book_metadata = self.book_data.pop("$490", {})
        for categorised_metadata in book_metadata.pop("$491", []):
            category = categorised_metadata.pop("$495")
            for metadata in categorised_metadata.pop("$258"):
                key = metadata.pop("$492")
                self.process_metadata_item(category, key, metadata.pop("$307"))
                self.check_empty(
                    metadata, "categorised_metadata %s/%s" % (category, key)
                )
            self.check_empty(categorised_metadata, "categorised_metadata %s" % category)
        self.check_empty(book_metadata, "$490")
        for key, value in self.book_data.pop("$258", {}).items():
            self.process_metadata_item("", METADATA_NAMES.get(key, str(key)), value)
        if self.epub.fixed_layout and not self.epub.is_print_replica:
            self.epub.set_book_type("comic")

    def process_metadata_item(self, category, key, value):
        """Apply a single metadata entry to the EPUB output state.

        category -- may be empty for entries from the flat metadata table.
        The dispatch key is "category/key" (or just key when category is empty);
        unrecognized keys are ignored.
        """
        cat_key = "%s/%s" % (category, key) if category else key
        if cat_key == "kindle_title_metadata/ASIN" or cat_key == "ASIN":
            if not self.epub.asin:
                self.epub.asin = value
        elif cat_key == "kindle_title_metadata/author":
            if value:
                self.epub.authors.insert(0, value)
        elif cat_key == "kindle_title_metadata/author_pronunciation":
            if value:
                self.epub.author_pronunciations.insert(0, value)
        elif cat_key == "author":
            if not self.epub.authors:
                self.epub.authors = [a.strip() for a in value.split("&") if a]
        elif (
            cat_key == "kindle_title_metadata/cde_content_type"
            or cat_key == "cde_content_type"
        ):
            self.cde_content_type = value
            if value == "MAGZ":
                self.epub.set_book_type("magazine")
            elif value == "EBSP":
                self.epub.is_sample = True
        elif cat_key == "kindle_title_metadata/description" or cat_key == "description":
            self.epub.description = value.strip()
        elif cat_key in ("kindle_title_metadata/cover_image", "cover_image"):
            # Merged: both keys carry the cover resource name.
            self.cover_resource = value
        elif cat_key == "kindle_title_metadata/dictionary_lookup":
            self.epub.is_dictionary = True
            self.epub.source_language = value.pop("$474")
            self.epub.target_language = value.pop("$163")
            self.check_empty(value, "kindle_title_metadata/dictionary_lookup")
        elif cat_key == "kindle_title_metadata/issue_date":
            self.epub.pubdate = value
        elif cat_key == "kindle_title_metadata/language" or cat_key == "language":
            self.epub.language = self.fix_language(value)
        elif cat_key == "kindle_title_metadata/publisher" or cat_key == "publisher":
            self.epub.publisher = value.strip()
        elif cat_key == "kindle_title_metadata/title" or cat_key == "title":
            if not self.epub.title:
                self.epub.title = value.strip()
        elif cat_key == "kindle_title_metadata/title_pronunciation":
            if not self.epub.title_pronunciation:
                self.epub.title_pronunciation = value.strip()
        elif cat_key == "kindle_ebook_metadata/book_orientation_lock":
            if value != self.epub.orientation_lock:
                log.error(
                    "Conflicting orientation lock values: %s, %s"
                    % (self.epub.orientation_lock, value)
                )
            self.epub.orientation_lock = value
        elif cat_key == "kindle_title_metadata/is_dictionary":
            self.epub.is_dictionary = value
        elif cat_key == "kindle_title_metadata/is_sample":
            self.epub.is_sample = value
        elif cat_key == "kindle_title_metadata/override_kindle_font":
            self.epub.override_kindle_font = value
        elif cat_key == "kindle_capability_metadata/continuous_popup_progression":
            self.epub.set_book_type("comic")
        elif cat_key == "kindle_capability_metadata/yj_fixed_layout":
            self.epub.fixed_layout = True
        elif cat_key == "kindle_capability_metadata/yj_forced_continuous_scroll":
            self.epub.scrolled_continuous = True
        elif cat_key == "kindle_capability_metadata/yj_guided_view_native":
            self.epub.guided_view_native = True
        elif cat_key == "kindle_capability_metadata/yj_publisher_panels":
            self.epub.set_book_type("comic")
            self.epub.region_magnification = True
        elif cat_key == "kindle_capability_metadata/yj_facing_page":
            self.epub.set_book_type("comic")
        elif cat_key == "kindle_capability_metadata/yj_double_page_spread":
            self.epub.set_book_type("comic")
        elif cat_key == "kindle_capability_metadata/yj_textbook":
            self.epub.set_book_type("print replica")
        elif cat_key == "kindle_capability_metadata/yj_illustrated_layout":
            self.epub.illustrated_layout = self.epub.html_cover = True
        elif cat_key == "reading_orders":
            if not self.reading_orders:
                self.reading_orders = value
        elif cat_key == "support_landscape":
            if value is False and self.epub.orientation_lock == "none":
                self.epub.orientation_lock = "portrait"
        elif cat_key == "support_portrait":
            if value is False and self.epub.orientation_lock == "none":
                self.epub.orientation_lock = "landscape"

View File

@@ -0,0 +1,810 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import operator
import re
from lxml import etree
from .epub_output import SVG, SVG_NAMESPACES, SVG_NS_URI, XLINK_NS_URI, qname
from .ion import IonSExp, IonStruct, IonSymbol, ion_type
from .ion_symbol_table import LocalSymbolTable
from .ion_text import IonText
from .message_logging import log
from .python_transition import IS_PYTHON2
from .utilities import get_url_filename, type_name, urlabspath, urlrelpath
from .yj_to_epub_properties import value_str
from .yj_versions import KNOWN_SUPPORTED_FEATURES
if IS_PYTHON2:
from .python_transition import repr, str, urllib
else:
import urllib.parse
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"

# Nominal device screen dimensions (px) assumed when evaluating YJ layout
# conditions; which one is width vs height depends on the orientation lock.
DEVICE_SCREEN_NARROW_PX = 1200
DEVICE_SCREEN_WIDE_PX = 1920
# How "render HTML" plugin content should be emitted — presumably consumed by
# content-processing code outside this chunk; TODO confirm.
RENDER_HTML_PLUGIN_AS = "iframe"
class KFX_EPUB_Misc(object):
def set_condition_operators(self):
    """Build self.condition_operators for YJ conditional evaluation.

    Maps each operator symbol to (arity, value-or-callable): arity 0 entries
    are constants (screen metrics for the assumed device), arity 1/2 entries
    are unary/binary callables, and None marks special-cased operators.
    """
    landscape = self.epub.orientation_lock == "landscape"
    screen_width = DEVICE_SCREEN_WIDE_PX if landscape else DEVICE_SCREEN_NARROW_PX
    screen_height = DEVICE_SCREEN_NARROW_PX if landscape else DEVICE_SCREEN_WIDE_PX
    self.condition_operators = {
        "$305": (0, screen_height),
        "$304": (0, screen_width),
        "$300": (0, True),
        "$301": (0, True),
        "$183": (0, 0),
        "$302": (0, screen_width),
        "$303": (0, screen_height),
        "$525": (0, (screen_width > screen_height)),
        "$526": (0, (screen_width < screen_height)),
        "$660": (0, True),
        "$293": (1, operator.not_),
        "$266": (1, None),
        "$750": (1, None),
        "$659": (None, None),
        "$292": (2, operator.and_),
        "$291": (2, operator.or_),
        "$294": (2, operator.eq),
        "$295": (2, operator.ne),
        "$296": (2, operator.gt),
        "$297": (2, operator.ge),
        "$298": (2, operator.lt),
        "$299": (2, operator.le),
        "$516": (2, operator.add),
        "$517": (2, operator.sub),
        "$518": (2, operator.mul),
        "$519": (2, operator.truediv),
    }
def evaluate_binary_condition(self, condition):
    """Evaluate *condition* expecting a boolean; log and return False otherwise."""
    result = self.evaluate_condition(condition)
    if result in {True, False}:
        return result
    log.error(
        "Condition has non-binary result (%s): %s"
        % (str(result), str(condition))
    )
    return False
def evaluate_condition(self, condition):
    # Recursively evaluate a YJ condition expression.
    # A condition is either a bare operator symbol (arity 0) or an s-expression
    # (operator, arg1[, arg2]); operators come from self.condition_operators.
    if ion_type(condition) is IonSExp:
        op = condition[0]
        num = len(condition) - 1
    else:
        op = condition
        num = 0
    if (ion_type(op) is not IonSymbol) or (op not in self.condition_operators):
        log.error("Condition operator is unknown: %s" % str(condition))
        return False
    nargs, func = self.condition_operators[op]
    if nargs is None:
        # "$659" is yj.supports: true only for known-supported feature tuples.
        if op == "$659":
            if tuple(condition[1:]) in KNOWN_SUPPORTED_FEATURES:
                return True
            log.error("yj.supports feature unknown: %s" % repr(condition))
            return False
    if nargs != num:
        # NOTE: also reached when nargs is None for a non-"$659" operator.
        log.error(
            "Condition operator has wrong number of arguments: %s" % str(condition)
        )
        return False
    if nargs == 0:
        # Arity-0 operators carry their constant value directly.
        return func
    if nargs == 1:
        if op == "$266":
            # Special case: always evaluates to 0.
            return 0
        if op == "$750":
            # yj.layout_type: "$752" -> True, "$753" -> False.
            if condition[1] == "$752":
                return True
            if condition[1] == "$753":
                return False
            log.error("yj.layout_type unknown: %s" % condition[1])
            return False
        return func(self.evaluate_condition(condition[1]))
    return func(
        self.evaluate_condition(condition[1]), self.evaluate_condition(condition[2])
    )
def add_svg_wrapper_to_block_image(
    self, content_elem, book_part, fixed_height=0, fixed_width=0
):
    """Replace the single block <img> under *content_elem* with a scalable SVG wrapper.

    The expected structure is one <div> child containing exactly one <img>;
    anything else is logged. fixed_height/fixed_width, when non-zero, are the
    dimensions the image style is expected to declare.
    """
    if len(content_elem) != 1:
        log.error(
            "Incorrect div content for SVG wrapper: %s"
            % etree.tostring(content_elem)
        )
    for image_div in content_elem.findall("*"):
        if (
            image_div.tag == "div"
            and len(image_div) == 1
            and image_div[0].tag == "img"
        ):
            # Strip styles that the SVG wrapper makes irrelevant.
            div_style = self.get_style(image_div)
            div_style.pop("-kfx-style-name", "")
            div_style.pop("font-size", "")
            div_style.pop("line-height", "")
            img = image_div[0]
            img_style = self.get_style(img)
            img_style.pop("-kfx-style-name", "")
            img_style.pop("font-size", "")
            img_style.pop("line-height", "")
            iheight = img_style.pop("height", "")
            iwidth = img_style.pop("width", "")
            try:
                img_file = self.epub.oebps_files[
                    get_url_filename(
                        urlabspath(img.get("src"), ref_from=book_part.filename)
                    )
                ]
                img_height = img_file.height
                img_width = img_file.width
            except Exception as e:
                # Missing/unresolvable manifest entry for the image: skip the
                # wrapper instead of aborting. (Was a debug print to stdout.)
                log.error(
                    "SVG wrapper image lookup failed for %s: %s"
                    % (img.get("src"), e)
                )
                return
            # px_to_int is defined elsewhere in this module (not visible in
            # this chunk) — presumably parses "123px" style values to int.
            orig_int_height = int_height = px_to_int(iheight)
            orig_int_width = int_width = px_to_int(iwidth)
            if (int_height and fixed_height and int_height != fixed_height) or (
                int_width and fixed_width and int_width != fixed_width
            ):
                log.error(
                    "Unexpected image style for SVG wrapper (fixed h=%d, w=%d): %s"
                    % (fixed_height, fixed_width, etree.tostring(image_div))
                )
            if int_height and int_width:
                # Compare the style-declared aspect ratio with the image file's.
                img_aspect = float(int_height) / float(int_width)
                svg_aspect = float(img_height) / float(img_width)
                if abs(img_aspect - svg_aspect) > 0.01:
                    log.error(
                        "Image (h=%d, w=%d) aspect ratio %f does not match SVG wrapper (h=%d, w=%d) %f"
                        % (
                            img_height,
                            img_width,
                            img_aspect,
                            int_height,
                            int_width,
                            svg_aspect,
                        )
                    )
            else:
                # No usable declared size: fall back to the file's dimensions.
                int_height = img_height
                int_width = img_width
            # Verify the remaining styles match the pattern the wrapper replaces.
            if not (
                div_style.pop("text-align", "center") == "center"
                and div_style.pop("text-indent", "0") == "0"
                and img_style.pop("position", "absolute") == "absolute"
                and img_style.pop("top", "0") == "0"
                and img_style.pop("left", "0") == "0"
                and (iheight == "" or orig_int_height)
                and (
                    iwidth == ""
                    or orig_int_width
                    or re.match(r"^(100|9[5-9].*)%$", iwidth)
                )
                and len(img_style) == 0
                and len(div_style) == 0
            ):
                log.error(
                    "Unexpected image style for SVG wrapper (img h=%d, w=%d): %s"
                    % (img_height, img_width, etree.tostring(image_div))
                )
            image_div.remove(img)
            svg = etree.SubElement(
                image_div,
                SVG,
                nsmap=SVG_NAMESPACES,
                attrib={
                    "version": "1.1",
                    "preserveAspectRatio": "xMidYMid meet",
                    "viewBox": "0 0 %d %d" % (int_width, int_height),
                    "height": "100%",
                    "width": "100%",
                },
            )
            self.move_anchors(img, svg)
            etree.SubElement(
                svg,
                qname(SVG_NS_URI, "image"),
                attrib={
                    qname(XLINK_NS_URI, "href"): img.get("src"),
                    "height": "%d" % int_height,
                    "width": "%d" % int_width,
                },
            )
        else:
            log.error(
                "Incorrect image content for SVG wrapper: %s"
                % etree.tostring(image_div)
            )
def horizontal_fxl_block_images(self, content_elem, book_part):
    """Absolutely position consecutive block images side by side (fixed layout).

    Each child of *content_elem* must be a <div> wrapping a single <img>;
    every image is pinned at top 0 with an accumulating left offset equal to
    the total width of the images before it.
    """
    x_offset = 0
    for child in content_elem.findall("*"):
        if not (child.tag == "div" and len(child) == 1 and child[0].tag == "img"):
            log.error(
                "Incorrect image content for horizontal fxl: %s"
                % etree.tostring(child)
            )
            continue
        img = child[0]
        img_file = self.epub.oebps_files[
            get_url_filename(
                urlabspath(img.get("src"), ref_from=book_part.filename)
            )
        ]
        img_style = self.get_style(img)
        if any(
            prop in img_style
            for prop in ("position", "top", "left", "height", "width")
        ):
            log.error(
                "Unexpected image style for horizontal fxl: %s"
                % etree.tostring(child)
            )
        img_style["position"] = "absolute"
        img_style["top"] = value_str(0, "px")
        img_style["left"] = value_str(x_offset, "px")
        img_style["height"] = value_str(img_file.height, "px")
        img_style["width"] = value_str(img_file.width, "px")
        self.set_style(img, img_style)
        x_offset += img_file.width
def process_kvg_shape(self, parent, shape, content_list, book_part, writing_mode):
    # Convert one KVG (Kindle vector graphics) shape into an SVG child of *parent*.
    # "$273" is a path shape; "$270" is a container that references text content
    # from *content_list* by its source id.
    shape_type = shape.pop("$159")
    if shape_type == "$273":
        elem = etree.SubElement(
            parent,
            qname(SVG_NS_URI, "path"),
            attrib={"d": self.process_path(shape.pop("$249"))},
        )
    elif shape_type == "$270":
        source = shape.pop("$474")
        # Locate (and consume) the matching entry in content_list by id
        # ("$155") or kfx id ("$598"); symbols are resolved to fragments first.
        for i, content in enumerate(content_list):
            if ion_type(content) is IonSymbol:
                content = self.get_fragment(ftype="$608", fid=content)
            if content.get("$155") == source or content.get("$598") == source:
                break
        else:
            log.error("Missing KVG container content ID: %s" % source)
            return
        content_list.pop(i)
        self.process_content(content, parent, book_part, writing_mode)
        # process_content appended the rendered content; convert it to SVG text.
        elem = parent[-1]
        if elem.tag != "div":
            log.error("Unexpected non-text content in KVG container: %s" % elem.tag)
            return
        elem.tag = qname(SVG_NS_URI, "text")
    else:
        log.error("Unexpected shape type: %s" % shape_type)
        return
    # Copy recognized YJ style properties onto the SVG element.
    for yj_property_name, svg_attrib in [
        ("$70", "fill"),
        ("$72", "fill-opacity"),
        ("$75", "stroke"),
        ("$77", "stroke-linecap"),
        ("$529", "stroke-linejoin"),
        ("$530", "stroke-miterlimit"),
        ("$76", "stroke-width"),
        ("$98", "transform"),
    ]:
        if yj_property_name in shape:
            elem.set(
                svg_attrib,
                self.property_value(
                    yj_property_name, shape.pop(yj_property_name), svg=True
                ),
            )
    # A stroked shape with no explicit fill must not default to SVG's black fill.
    if "stroke" in elem.attrib and "fill" not in elem.attrib:
        elem.set("fill", "none")
    self.check_empty(shape, "shape")
    def process_path(self, path):
        """Convert KFX path data into an SVG path "d" attribute string.

        *path* is either an IonStruct referencing an entry of the book's
        path bundle ("$692", resolved recursively), or a flat list of
        opcodes followed by their coordinates:
        0=moveto (M), 1=lineto (L), 2=quadratic bezier (Q),
        3=cubic bezier (C), 4=closepath (Z).
        """
        if ion_type(path) is IonStruct:
            # Indirect reference: resolve through the named path bundle.
            path_bundle_name = path.pop("name")
            path_index = path.pop("$403")
            self.check_empty(path, "path")
            if (
                "$692" not in self.book_data
                or path_bundle_name not in self.book_data["$692"]
            ):
                log.error("Missing book path_bundle: %s" % path_bundle_name)
                return ""
            return self.process_path(
                self.book_data["$692"][path_bundle_name]["$693"][path_index]
            )
        p = list(path)
        d = []
        def process_instruction(inst, n_args, pixels=True):
            # Emit one SVG command letter plus n_args coordinate values,
            # scaling pixel coordinates as needed.
            d.append(inst)
            for j in range(n_args):
                if len(p) == 0:
                    log.error("Incomplete path instruction in %s" % str(path))
                    return
                v = p.pop(0)
                if pixels:
                    v = self.adjust_pixel_value(v)
                d.append(value_str(v))
        while len(p) > 0:
            inst = p.pop(0)
            if inst == 0:
                process_instruction("M", 2)
            elif inst == 1:
                process_instruction("L", 2)
            elif inst == 2:
                process_instruction("Q", 4)
            elif inst == 3:
                process_instruction("C", 6)
            elif inst == 4:
                process_instruction("Z", 0)
            else:
                log.error(
                    "Unexpected path instruction %s in %s" % (str(inst), str(path))
                )
                break
        return " ".join(d)
def process_polygon(self, path):
def percent_value_str(v):
return value_str(v * 100, "%", emit_zero_unit=True)
d = []
i = 0
ln = len(path)
while i < ln:
inst = path[i]
if inst == 0 or inst == 1:
if i + 3 > ln:
log.error("Bad path instruction in %s" % str(path))
break
d.append(
"%s %s"
% (percent_value_str(path[i + 1]), percent_value_str(path[i + 2]))
)
i += 3
elif inst == 4:
i += 1
else:
log.error(
"Unexpected path instruction %s in %s" % (str(inst), str(path))
)
break
return "polygon(%s)" % (", ".join(d))
    def process_transform(self, vals, svg):
        """Convert a 6-value affine matrix [a, b, c, d, e, f] into a CSS or SVG
        transform string.

        *svg* selects SVG syntax (space-separated, unitless) versus CSS syntax
        (comma-separated, px units). Pure translations, scales and 90/180
        degree rotations are emitted symbolically; any other matrix falls
        back to matrix(). Returns "?" if the value list is malformed.
        """
        if svg:
            px = ""
            sep = " "
        else:
            px = "px"
            sep = ","
        if len(vals) == 6:
            # e/f are the translation components, expressed in pixels.
            vals[4] = self.adjust_pixel_value(vals[4])
            vals[5] = self.adjust_pixel_value(vals[5])
            if vals[4:6] == [0.0, 0.0]:
                translate = ""
            else:
                translate = "translate(%s%s%s) " % (
                    value_str(vals[4], px),
                    sep,
                    value_str(vals[5], px),
                )
            if vals[0:4] == [1.0, 0.0, 0.0, 1.0] and translate:
                # Identity linear part: translation only.
                return translate.strip()
            if vals[1:3] == [0.0, 0.0]:
                # No rotation/shear: pure scale (uniform when a == d).
                if vals[0] == vals[3]:
                    return translate + ("scale(%s)" % value_str(vals[0]))
                return translate + (
                    "scale(%s%s%s)" % (value_str(vals[0]), sep, value_str(vals[3]))
                )
            if vals[0:4] == [0.0, 1.0, -1.0, 0.0]:
                return translate + "rotate(-90deg)"
            if vals[0:4] == [0.0, -1.0, 1.0, 0.0]:
                return translate + "rotate(90deg)"
            if vals[0:4] == [-1.0, 0.0, 0.0, -1.0]:
                return translate + "rotate(180deg)"
            log.error("Unexpected transform matrix: %s" % str(vals))
            return "matrix(%s)" % (sep.join([value_str(v) for v in vals]))
        log.error("Unexpected transform: %s" % str(vals))
        return "?"
def process_plugin(
self, resource_name, alt_text, content_elem, book_part, is_html=False
):
res = self.process_external_resource(resource_name, save=False, is_plugin=True)
if is_html or res.mime == "plugin/kfx-html-article":
src = urlrelpath(
self.process_external_resource(
resource_name, is_plugin=True, save_referred=True
).filename,
ref_from=book_part.filename,
)
if RENDER_HTML_PLUGIN_AS == "iframe":
content_elem.tag = "iframe"
content_elem.set("src", src)
self.add_style(
content_elem,
{
"height": "100%",
"width": "100%",
"border-bottom-style": "none",
"border-left-style": "none",
"border-right-style": "none",
"border-top-style": "none",
},
)
elif RENDER_HTML_PLUGIN_AS == "object":
content_elem.tag = "object"
content_elem.set("data", src)
content_elem.set("type", "text/html")
self.add_style(
content_elem,
{
"height": "100%",
"width": "100%",
"border-bottom-style": "none",
"border-left-style": "none",
"border-right-style": "none",
"border-top-style": "none",
},
)
else:
content_elem.tag = "a"
content_elem.set("href", src)
content_elem.text = "[click here to read the content]"
elif res.format == "$284":
content_elem.tag = "img"
content_elem.set(
"src",
urlrelpath(
self.process_external_resource(resource_name).filename,
ref_from=book_part.filename,
),
)
content_elem.set("alt", alt_text)
else:
manifest_raw_media = res.raw_media.decode("utf-8")
manifest_symtab = LocalSymbolTable(
context="plugin %s" % resource_name, ignore_undef=True
)
try:
manifest_ = IonText(symtab=manifest_symtab).deserialize_annotated_value(
manifest_raw_media, import_symbols=None
)
except Exception:
log.error("Exception processing plugin %s" % resource_name)
raise
manifest_symtab.report()
plugin_type = manifest_.get_annotation()
manifest = manifest_.value
if plugin_type == "audio":
self.process_external_resource(
resource_name, save=False, is_plugin=True, process_referred=True
)
content_elem.tag = "audio"
content_elem.set("controls", "")
src = self.uri_reference(
manifest["facets"]["media"]["uri"], manifest_external_refs=True
)
content_elem.set("src", urlrelpath(src, ref_from=book_part.filename))
player = manifest["facets"]["player"]
for image_refs in ["play_images", "pause_images"]:
for uri in player.get(image_refs, []):
self.uri_reference(uri, save=False)
elif plugin_type == "button":
RENDER_BUTTON_PLUGIN = True
content_elem.tag = "div"
for image in manifest["facets"]["images"]:
if image["role"] != "upstate":
log.warning(
"Unknown button image role %s in %s"
% (image["role"], resource_name)
)
if RENDER_BUTTON_PLUGIN:
img = etree.SubElement(content_elem, "img")
img.set(
"src",
urlrelpath(
self.uri_reference(image["uri"]),
ref_from=book_part.filename,
),
)
img.set("alt", alt_text)
self.add_style(img, {"max-width": "100%"})
else:
self.uri_reference(image["uri"], save=False)
clicks = manifest["events"]["click"]
for click in clicks if isinstance(clicks, list) else [clicks]:
if click["name"] != "change_state":
log.warning(
"Unknown button event click name %s in %s"
% (click["name"], resource_name)
)
self.process_external_resource(
resource_name, is_plugin=True, save=False, process_referred=True
)
elif plugin_type == "hyperlink":
content_elem.tag = "a"
self.add_style(content_elem, {"height": "100%", "width": "100%"})
uri = manifest["facets"]["uri"]
if uri:
content_elem.set(
"href",
urlrelpath(
self.uri_reference(uri), ref_from=book_part.filename
),
)
elif plugin_type == "image_sequence":
content_elem.tag = "div"
for image in manifest["facets"]["images"]:
img = etree.SubElement(content_elem, "img")
img.set(
"src",
urlrelpath(
self.uri_reference(image["uri"]),
ref_from=book_part.filename,
),
)
img.set("alt", alt_text)
elif plugin_type in ["scrollable", "slideshow"]:
content_elem.tag = "div"
if manifest["properties"].get("initial_visibility") == "hide":
self.add_style(content_elem, {"visibility": "hidden"})
if "alt_text" in manifest["properties"]:
alt_text = manifest["properties"]["alt_text"]
for child in manifest["facets"]["children"]:
self.process_plugin_uri(
child["uri"], child["bounds"], content_elem, book_part
)
if plugin_type == "scrollable":
self.process_external_resource(
resource_name, is_plugin=True, save=False, process_referred=True
)
elif plugin_type == "video":
content_elem.tag = "video"
if manifest["properties"].get("user_interaction") == "enabled":
content_elem.set("controls", "")
if (
manifest.get("events", {}).get("enter_view", {}).get("name")
== "start"
):
content_elem.set("autoplay", "")
if (
manifest["properties"].get("play_context", {}).get("loop_count", 0)
< 0
):
content_elem.set("loop", "")
if "poster" in manifest["facets"]:
content_elem.set(
"poster",
urlrelpath(
self.uri_reference(manifest["facets"]["poster"]["uri"]),
ref_from=book_part.filename,
),
)
if "first_frame" in manifest["facets"]:
self.uri_reference(
manifest["facets"]["first_frame"]["uri"], save=False
)
alt_text = alt_text or "Cannot display %s content" % plugin_type
src = self.uri_reference(
manifest["facets"]["media"]["uri"], manifest_external_refs=True
)
content_elem.set("src", urlrelpath(src, ref_from=book_part.filename))
dummy_elem = etree.Element("dummy")
while len(content_elem) > 0:
e = content_elem[0]
content_elem.remove(e)
dummy_elem.append(e)
self.move_anchors(dummy_elem, content_elem)
elif plugin_type == "webview":
self.process_external_resource(
resource_name, is_plugin=True, save=False, save_referred=True
)
purl = urllib.parse.urlparse(manifest["facets"]["uri"])
if purl.scheme == "kfx":
self.process_plugin(
urllib.parse.unquote(purl.netloc + purl.path),
alt_text,
content_elem,
book_part,
is_html=True,
)
else:
log.error("Unexpected webview plugin URI scheme: %s" % uri)
elif plugin_type == "zoomable":
content_elem.tag = "img"
content_elem.set(
"src",
urlrelpath(
self.uri_reference(manifest["facets"]["media"]["uri"]),
ref_from=book_part.filename,
),
)
content_elem.set("alt", alt_text)
else:
log.error(
"Unknown plugin type %s in resource %s"
% (plugin_type, resource_name)
)
content_elem.tag = "object"
src = self.process_external_resource(
resource_name, is_plugin=True, save_referred=True
).filename
content_elem.set("data", urlrelpath(src, ref_from=book_part.filename))
content_elem.set("type", self.epub.oebps_files[src].mimetype)
if len(content_elem) == 0:
content_elem.text = (
alt_text or "Cannot display %s content" % plugin_type
)
def process_plugin_uri(self, uri, bounds, content_elem, book_part):
purl = urllib.parse.urlparse(uri)
if purl.scheme == "kfx":
child_elem = etree.SubElement(content_elem, "plugin-temp")
self.process_plugin(
urllib.parse.unquote(purl.netloc + purl.path), "", child_elem, book_part
)
self.process_bounds(child_elem, bounds)
else:
log.error("Unexpected plugin URI scheme: %s" % uri)
def process_bounds(self, elem, bounds):
for bound, property_name in [
("x", "left"),
("y", "top"),
("h", "height"),
("w", "width"),
]:
if bound in bounds:
bound_value = bounds[bound]
if ion_type(bound_value) is IonStruct:
unit = bound_value.pop("unit")
value = value_str(
bound_value.pop("value"), "%" if unit == "percent" else unit
)
self.check_empty(bound_value, "Bound %s value" % property_name)
self.add_style(elem, {property_name: value}, replace=True)
if bound in ["x", "y"]:
self.add_style(elem, {"position": "absolute"})
else:
log.error(
"Unexpected bound data type %s: %s"
% (type_name(bound), repr(bound))
)
def px_to_int(s):
    """Parse a non-negative integer pixel value such as "12" or "12px".

    Returns 0 when *s* is not a plain ASCII integer with an optional
    "px" suffix.
    """
    match = re.match(r"^([0-9]+)(px)?$", s)
    if match:
        return int(match.group(1))
    return 0

View File

@@ -0,0 +1,557 @@
from __future__ import absolute_import, division, print_function, unicode_literals
from .epub_output import TocEntry
from .message_logging import log
from .python_transition import IS_PYTHON2
from .utilities import make_unique_name, urlrelpath
from .yj_position_location import DEBUG_PAGES
from .yj_structure import APPROXIMATE_PAGE_LIST
if IS_PYTHON2:
from .python_transition import str, urllib
else:
import urllib.parse
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
# Tuning flags for page-list handling.
KEEP_APPROX_PG_NUMS = False  # keep entries from the approximate page list
REPORT_DUPLICATE_PAGES = False  # warn when a page label repeats at one position
PREVENT_DUPLICATE_PAGE_LABELS = False  # warn on same label at different positions
PREVENT_DUPLICATE_PAGE_TARGETS = False  # warn on different labels at one position
# Maps KFX landmark type symbols to EPUB guide reference types.
GUIDE_TYPE_OF_LANDMARK_TYPE = {
    "$233": "cover",
    "$396": "text",
    "$269": "text",
    "$212": "toc",
}
# NCX class names by nesting depth for periodical navigation.
PERIODICAL_NCX_CLASSES = {
    0: "section",
    1: "article",
}
class KFX_EPUB_Navigation(object):
    def process_navigation(self):
        """Process the book's navigation data ("$389"/"$390").

        Records the owning section of each nav container, then matches each
        reading order to its book_navigation entry and dispatches every
        contained nav container to process_nav_container. Warns when a
        reading order has no navigation. Leftover top-level nav containers
        are tolerated only for prepub KPF books.
        """
        for section_nav in self.book_data.pop("$390", []):
            section_name = section_nav.pop("$174")
            for nav_container in section_nav.pop("$392", []):
                self.nav_container_section[nav_container] = section_name
            self.check_empty(section_nav, "section_navigation")
        book_navigations = self.book_data.pop("$389", [])
        for reading_order in self.reading_orders:
            reading_order_name = reading_order.get("$178", "")
            for i, book_navigation in enumerate(book_navigations):
                if book_navigation.get("$178", "") == reading_order_name:
                    book_navigations.pop(i)
                    book_navigation.pop("$178", None)
                    for nav_container_ in book_navigation.pop("$392"):
                        nav_container = self.get_fragment(
                            ftype="$391", fid=nav_container_
                        )
                        self.process_nav_container(
                            nav_container, nav_container_, reading_order_name
                        )
                    self.check_empty(book_navigation, "book_navigation")
                    break
            else:
                # No book_navigation matched this reading order.
                log.warning(
                    'Failed to locate navigation for reading order "%s"'
                    % reading_order_name
                )
        self.check_empty(book_navigations, "book_navigation")
        nav_container = self.book_data.pop("$391", {})
        if not self.book.is_kpf_prepub:
            self.check_empty(nav_container, "nav_container")
        self.check_empty(self.book_data.pop("$394", {}), "conditional_nav_group_unit")
    def process_nav_container(
        self, nav_container, nav_container_name, reading_order_name
    ):
        """Process one navigation container.

        Depending on its type this produces NCX TOC entries ("$212"/"$213"/
        "$214"), EPUB guide entries from landmarks ("$236"), or page-map
        entries from a page list ("$237"). Containers with "imports" are
        expanded recursively.
        """
        nav_container.pop("mkfx_id", None)
        nav_container_name = nav_container.pop("$239", nav_container_name)
        section_name = self.nav_container_section.get(nav_container_name)
        nav_type = nav_container.pop("$235")
        if nav_type not in {"$212", "$236", "$237", "$213", "$214"}:
            log.error(
                "nav_container %s has unknown type: %s" % (nav_container_name, nav_type)
            )
        if "imports" in nav_container:
            # Imported containers are processed in place, recursively.
            for import_name in nav_container.pop("imports"):
                self.process_nav_container(
                    self.book_data["$391"].pop(import_name),
                    nav_container_name,
                    reading_order_name,
                )
        else:
            for nav_unit_ in nav_container.pop("$247"):
                nav_unit = self.get_fragment(ftype="$393", fid=nav_unit_)
                nav_unit.pop("mkfx_id", None)
                if nav_type in {"$212", "$214", "$213"}:
                    # TOC-like container: build NCX TOC entries.
                    self.process_nav_unit(
                        nav_type,
                        nav_unit,
                        self.epub.ncx_toc,
                        nav_container_name,
                        section_name,
                    )
                elif nav_type == "$236":
                    # Landmark container: becomes EPUB guide entries.
                    label = self.get_representation(nav_unit)[0]
                    nav_unit_name = nav_unit.pop("$240", label)
                    target_position = self.get_position(nav_unit.pop("$246"))
                    landmark_type = nav_unit.pop("$238", None)
                    if landmark_type:
                        guide_type = GUIDE_TYPE_OF_LANDMARK_TYPE.get(landmark_type)
                        if guide_type is None:
                            log.warning("Unexpected landmark_type: %s" % landmark_type)
                            guide_type = landmark_type
                        if label == "cover-nav-unit":
                            label = ""
                        anchor_name = self.unique_anchor_name(
                            str(nav_unit_name) or guide_type
                        )
                        self.register_anchor(anchor_name, target_position)
                        self.epub.add_guide_entry(guide_type, label, anchor=anchor_name)
                elif nav_type == "$237":
                    # Page list container: becomes page-map entries.
                    label = self.get_representation(nav_unit)[0]
                    nav_unit_name = nav_unit.pop("$240", "page_list_entry")
                    target_position = self.get_position(nav_unit.pop("$246"))
                    if nav_unit_name != "page_list_entry":
                        log.warning(
                            "Unexpected page_list nav_unit_name: %s" % nav_unit_name
                        )
                    # Approximate page lists are usually ignored unless a
                    # tuning flag or page debugging keeps them.
                    if label and (
                        KEEP_APPROX_PG_NUMS
                        or DEBUG_PAGES
                        or nav_container_name != APPROXIMATE_PAGE_LIST
                    ):
                        anchor_name = "page_%s" % label
                        if len(self.reading_orders) > 1:
                            anchor_name = "%s_%s" % (reading_order_name, anchor_name)
                        anchor_name = self.unique_anchor_name(anchor_name)
                        anchor_id = self.register_anchor(anchor_name, target_position)
                        if (
                            PREVENT_DUPLICATE_PAGE_TARGETS
                            and anchor_id in self.page_anchor_id_label
                        ):
                            log.warning(
                                "Page %s is at the same position as page %s"
                                % (label, self.page_anchor_id_label[anchor_id])
                            )
                        else:
                            self.page_anchor_id_label[anchor_id] = label
                            if self.page_label_anchor_id.get(label) == anchor_id:
                                # Same label already registered at this target.
                                if (
                                    REPORT_DUPLICATE_PAGES
                                    and label not in self.reported_duplicate_page_label
                                ):
                                    log.warning(
                                        "Page %s occurs multiple times with same position"
                                        % label
                                    )
                                    self.reported_duplicate_page_label.add(label)
                            elif (
                                PREVENT_DUPLICATE_PAGE_LABELS
                                and len(self.reading_orders) == 1
                            ):
                                log.warning(
                                    "Page %s occurs multiple times with different positions"
                                    % label
                                )
                            else:
                                self.page_label_anchor_id[label] = anchor_id
                            self.epub.add_pagemap_entry(label, anchor=anchor_name)
                self.check_empty(
                    nav_unit, "nav_container %s nav_unit" % nav_container_name
                )
        self.check_empty(nav_container, "nav_container %s" % nav_container_name)
    def process_nav_unit(
        self, nav_type, nav_unit, ncx_toc, nav_container_name, section_name
    ):
        """Recursively convert one nav unit (and its children) into TOC entries.

        Children come from "$247" directly or from orientation-conditional
        entry sets ("$248"); entries for the non-matching orientation are
        discarded. Units with a target position get a registered anchor.
        A unit with neither label nor anchor is flattened into its parent.
        """
        label, icon = self.get_representation(nav_unit)
        if label:
            label = label.strip()
        description = nav_unit.pop("$154", None)
        if description:
            description = description.strip()
        nav_unit_name = nav_unit.pop("$240", label)
        nav_unit.pop("mkfx_id", None)
        nested_toc = []
        for entry in nav_unit.pop("$247", []):
            nested_nav_unit = self.get_fragment(ftype="$393", fid=entry)
            self.process_nav_unit(
                nav_type, nested_nav_unit, nested_toc, nav_container_name, section_name
            )
        for entry_set in nav_unit.pop("$248", []):
            for entry in entry_set.pop("$247", []):
                nested_nav_unit = self.get_fragment(ftype="$393", fid=entry)
                self.process_nav_unit(
                    nav_type,
                    nested_nav_unit,
                    nested_toc,
                    nav_container_name,
                    section_name,
                )
            # Keep the entry set only when its orientation matches the book's
            # orientation lock ("$386"=landscape, "$385"=portrait).
            orientation = entry_set.pop("$215")
            if orientation == "$386":
                if self.epub.orientation_lock != "landscape":
                    nested_toc = []
            elif orientation == "$385":
                if self.epub.orientation_lock == "landscape":
                    nested_toc = []
            else:
                log.error("Unknown entry set orientation: %s" % orientation)
            if section_name and nav_type == "$214":
                # Record section nav-to targets for periodical navigation.
                for i, entry in enumerate(nested_toc):
                    self.navto_anchor[(section_name, float(i))] = entry.anchor
            self.check_empty(
                entry_set,
                "nav_container %s %s entry_set" % (nav_container_name, nav_type),
            )
        if "$246" in nav_unit:
            # Unit points at a position: register a unique TOC anchor for it.
            anchor_name = "toc%d_%s" % (self.toc_entry_count, nav_unit_name)
            self.toc_entry_count += 1
            target_position = self.get_position(nav_unit.pop("$246"))
            self.register_anchor(anchor_name, target_position)
        else:
            anchor_name = None
        if (not label) and (not anchor_name):
            # Nothing to show or link: hoist the children into the parent.
            ncx_toc.extend(nested_toc)
        else:
            ncx_toc.append(
                TocEntry(
                    label,
                    anchor=anchor_name,
                    children=nested_toc,
                    description=description,
                    icon=self.process_external_resource(icon).filename
                    if icon
                    else None,
                )
            )
        self.check_empty(
            nav_unit, "nav_container %s %s nav_unit" % (nav_container_name, nav_type)
        )
def unique_anchor_name(self, anchor_name):
if anchor_name and anchor_name not in self.anchor_positions:
return anchor_name
count = 0
while True:
new_anchor_name = "%s:%d" % (anchor_name, count)
if new_anchor_name not in self.anchor_positions:
return new_anchor_name
count += 1
    def process_anchors(self):
        """Register every anchor fragment ("$266").

        Anchors with "$186" map directly to an external URI; anchors with
        "$183" resolve to a book position and are registered for later
        element-id assignment.
        """
        anchors = self.book_data.pop("$266", {})
        for anchor_name, anchor in anchors.items():
            self.check_fragment_name(anchor, "$266", anchor_name)
            if "$186" in anchor:
                # Anchor to an external URI: store as-is.
                self.anchor_uri[str(anchor_name)] = anchor.pop("$186")
            elif "$183" in anchor:
                # Anchor to a position within the book content.
                self.register_anchor(
                    str(anchor_name), self.get_position(anchor.pop("$183"))
                )
            anchor.pop("$597", None)
            self.check_empty(anchor, "anchor %s" % anchor_name)
def get_position(self, position):
id = self.get_location_id(position)
offset = position.pop("$143", 0)
self.check_empty(position, "position")
return (id, offset)
def get_representation(self, entry):
label = ""
icon = None
if "$241" in entry:
representation = entry.pop("$241")
if "$245" in representation:
icon = representation.pop("$245")
self.process_external_resource(icon)
label = str(icon)
if "$244" in representation:
label = representation.pop("$244")
self.check_empty(representation, "nav_container representation")
return (label, icon)
def position_str(self, position):
return "%s.%d" % position
def register_anchor(self, anchor_name, position):
if self.DEBUG:
log.debug(
"register_anchor %s = %s" % (anchor_name, self.position_str(position))
)
if not anchor_name:
raise Exception(
"register_anchor: anchor name is missing for position %s"
% self.position_str(position)
)
if anchor_name not in self.anchor_positions:
self.anchor_positions[anchor_name] = set()
self.anchor_positions[anchor_name].add(position)
eid, offset = position
if eid not in self.position_anchors:
self.position_anchors[eid] = {}
if offset not in self.position_anchors[eid]:
self.position_anchors[eid][offset] = []
if anchor_name not in self.position_anchors[eid][offset]:
self.position_anchors[eid][offset].append(anchor_name)
return self.get_anchor_id(self.position_anchors[eid][offset][0])
def register_link_id(self, eid, kind):
return self.register_anchor("%s_%s" % (kind, eid), (eid, 0))
def get_anchor_id(self, anchor_name):
if anchor_name not in self.anchor_id:
self.anchor_id[anchor_name] = new_id = make_unique_name(
self.fix_html_id(anchor_name), self.anchor_ids
)
self.anchor_ids.add(new_id)
return self.anchor_id[anchor_name]
    def process_position(self, eid, offset, elem):
        """Attach any anchors registered at (eid, offset) to *elem*.

        If anchors exist at this position, ensure elem has an id (using the
        first registered anchor's HTML id), point all those anchors at elem,
        and return the consumed anchor names; otherwise return [].
        """
        if self.DEBUG:
            log.debug("process position %s" % self.position_str((eid, offset)))
        if eid in self.position_anchors:
            if offset in self.position_anchors[eid]:
                if self.DEBUG:
                    log.debug("at registered position")
                if not elem.get("id", ""):
                    elem_id = self.get_anchor_id(self.position_anchors[eid][offset][0])
                    elem.set("id", elem_id)
                    if self.DEBUG:
                        log.debug(
                            "set element id %s for position %s"
                            % (elem_id, self.position_str((eid, offset)))
                        )
                anchor_names = self.position_anchors[eid].pop(offset)
                for anchor_name in anchor_names:
                    self.anchor_elem[anchor_name] = elem
                # Drop the eid entry once all of its offsets are consumed.
                if len(self.position_anchors[eid]) == 0:
                    self.position_anchors.pop(eid)
                return anchor_names
        return []
def move_anchor(self, old_elem, new_elem):
for anchor_name, elem in self.anchor_elem.items():
if elem is old_elem:
self.anchor_elem[anchor_name] = new_elem
if "id" in old_elem.attrib:
new_elem.set("id", old_elem.attrib.pop("id"))
def move_anchors(self, old_root, target_elem):
for anchor_name, elem in self.anchor_elem.items():
if root_element(elem) is old_root:
self.anchor_elem[anchor_name] = target_elem
if "id" in old_root.attrib and "id" not in target_elem.attrib:
target_elem.set("id", old_root.get("id"))
def get_anchor_uri(self, anchor_name):
self.used_anchors.add(anchor_name)
if anchor_name in self.anchor_uri:
return self.anchor_uri[anchor_name]
positions = self.anchor_positions.get(anchor_name, [])
log.error(
"Failed to locate uri for anchor: %s (position: %s)"
% (
anchor_name,
", ".join([self.position_str(p) for p in sorted(positions)]),
)
)
return "/MISSING_ANCHOR#" + anchor_name
def report_duplicate_anchors(self):
for anchor_name, positions in self.anchor_positions.items():
if (anchor_name in self.used_anchors) and (len(positions) > 1):
log.error(
"Anchor %s has multiple positions: %s"
% (
anchor_name,
", ".join([self.position_str(p) for p in sorted(positions)]),
)
)
def anchor_as_uri(self, anchor):
return "anchor:" + anchor
def anchor_from_uri(self, uri):
return uri[7:]
def id_of_anchor(self, anchor, filename):
url = self.get_anchor_uri(anchor)
purl = urllib.parse.urlparse(url)
if purl.path != filename or not purl.fragment:
log.error("anchor %s in file %s links to %s" % (anchor, filename, url))
return purl.fragment
    def fixup_anchors_and_hrefs(self):
        """Resolve every registered anchor to a final "file#id" URI, then
        rewrite all internal "anchor:" hrefs plus guide, page-map and NCX
        TOC targets to those URIs."""
        # Pass 1: give each anchored element an id and record its URI.
        for anchor_name, elem in self.anchor_elem.items():
            root = root_element(elem)
            for book_part in self.epub.book_parts:
                if book_part.html is root:
                    elem_id = elem.get("id", "")
                    if not elem_id:
                        elem_id = self.get_anchor_id(str(anchor_name))
                        elem.set("id", elem_id)
                    self.anchor_uri[anchor_name] = "%s#%s" % (
                        urllib.parse.quote(book_part.filename),
                        elem_id,
                    )
                    break
            else:
                log.error(
                    "Failed to locate element within book parts for anchor %s"
                    % anchor_name
                )
        # Element references are no longer needed (or valid) past this point.
        self.anchor_elem = None
        # Pass 2: anchors that target the first visible content of a file can
        # target the file itself instead of a fragment (unless immovable).
        for book_part in self.epub.book_parts:
            body = book_part.body()
            for e in body.iter("*"):
                if "id" in e.attrib and not visible_elements_before(e):
                    uri = book_part.filename + "#" + e.get("id")
                    if self.DEBUG:
                        log.debug("no visible element before %s" % uri)
                    for anchor, a_uri in self.anchor_uri.items():
                        if (a_uri == uri) and (anchor not in self.immovable_anchors):
                            self.anchor_uri[anchor] = urllib.parse.quote(
                                book_part.filename
                            )
                            if self.DEBUG:
                                log.debug(" moved anchor %s" % anchor)
        # Pass 3: rewrite "anchor:" hrefs into relative document URIs.
        for book_part in self.epub.book_parts:
            body = book_part.body()
            for e in body.iter("*"):
                if e.tag == "a" and e.get("href", "").startswith("anchor:"):
                    e.set(
                        "href",
                        urlrelpath(
                            self.get_anchor_uri(
                                self.anchor_from_uri(e.attrib.pop("href"))
                            ),
                            ref_from=book_part.filename,
                        ),
                    )
        # Pass 4: resolve guide, page-map, and (recursively) NCX TOC targets.
        for g in self.epub.guide:
            g.target = self.get_anchor_uri(g.anchor)
        for p in self.epub.pagemap:
            p.target = self.get_anchor_uri(p.anchor)
        def resolve_toc_target(ncx_toc):
            # Depth-first resolution of anchors in the TOC tree.
            for toc_entry in ncx_toc:
                if toc_entry.anchor:
                    toc_entry.target = self.get_anchor_uri(toc_entry.anchor)
                if toc_entry.children:
                    resolve_toc_target(toc_entry.children)
        resolve_toc_target(self.epub.ncx_toc)
def root_element(elem):
    """Return the topmost ancestor of *elem* (the element with no parent)."""
    parent = elem.getparent()
    while parent is not None:
        elem = parent
        parent = elem.getparent()
    return elem
def visible_elements_before(elem, root=None):
    """Return True if any rendered content occurs before *elem* in document
    order within its enclosing body (or the supplied *root*).

    "Rendered content" means an img/br/hr/li/ol/ul element or any element
    carrying text or a tail. Used to decide whether an anchor can target a
    file instead of a fragment.
    """
    if root is None:
        root = elem
        while root.tag != "body":
            root = root.getparent()
    if elem is root:
        return False
    for e in root.iterfind(".//*"):
        if e is elem:
            break
        if e.tag in ["img", "br", "hr", "li", "ol", "ul"] or e.text or e.tail:
            return True
    return False

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,580 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import io
import posixpath
import re
from PIL import Image
from .message_logging import log
from .python_transition import IS_PYTHON2
from .utilities import (
EXTS_OF_MIMETYPE,
RESOURCE_TYPE_OF_EXT,
convert_jxr_to_tiff,
convert_pdf_to_jpeg,
disable_debug_log,
font_file_ext,
image_file_ext,
root_filename,
urlrelpath,
)
from .yj_structure import SYMBOL_FORMATS
if IS_PYTHON2:
from .python_transition import repr, urllib
else:
import urllib.parse
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
# When a resource lists higher-resolution variants, substitute the larger one.
USE_HIGHEST_RESOLUTION_IMAGE_VARIANT = True
# Convert single PDF pages and JPEG-XR images into formats EPUB readers support.
FIX_PDF = True
FIX_JPEG_XR = True
JXR_TO_JPEG_QUALITY = 90  # JPEG quality used when re-encoding converted JPEG-XR
# Quality search bounds used when re-encoding combined tiled images.
MIN_JPEG_QUALITY = 80
MAX_JPEG_QUALITY = 100
# Warn when the combined tiled image size differs from the original tiles
# by more than this percentage.
TILE_SIZE_REPORT_PERCENTAGE = 10
class Obj(object):
    """Lightweight attribute container: keyword arguments become attributes."""

    def __init__(self, **kwargs):
        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)
class KFX_EPUB_Resources(object):
    def get_external_resource(self, resource_name, ignore_variants=False):
        """Load and normalize one external resource ("$164") fragment.

        Combines tiled images into a single re-encoded image, converts
        JPEG-XR and single PDF pages when the FIX_* flags are enabled,
        optionally substitutes a higher-resolution variant, and returns a
        cached Obj carrying raw_media, filename, dimensions and related
        metadata. Results are memoized in self.resource_cache.
        """
        resource_obj = self.resource_cache.get(resource_name)
        if resource_obj is not None:
            return resource_obj
        resource = self.get_fragment(ftype="$164", fid=resource_name)
        if resource.pop("$175", "") != resource_name:
            raise Exception("Name of resource %s is incorrect" % resource_name)
        format = resource.pop("$161", None)
        if format in SYMBOL_FORMATS:
            extension = "." + SYMBOL_FORMATS[format]
        elif format is not None:
            log.error("Resource %s has unknown format: %s" % (resource_name, format))
            extension = ".bin"
        fixed_height = resource.pop("$67", None)
        fixed_width = resource.pop("$66", None)
        resource_height = resource.pop("$423", None) or fixed_height
        resource_width = resource.pop("$422", None) or fixed_width
        if "$636" in resource:
            # Tiled image: paste the tiles (cropping off inter-tile padding)
            # into one full-size image, then binary-search for a JPEG quality
            # whose encoded size best matches the combined tile size.
            tile_height = resource.pop("$638")
            tile_width = resource.pop("$637")
            tile_padding = resource.pop("$797", 0)
            with disable_debug_log():
                full_image = Image.new("RGB", (resource_width, resource_height))
                separate_tiles_size = tile_count = 0
                col = resource.pop("$636")
                for y, row in enumerate(col):
                    # Edge tiles have no padding on the outer side; the last
                    # row/column absorbs any remainder of the full size.
                    top_padding = 0 if y == 0 else tile_padding
                    bottom_padding = (
                        resource_height - tile_height * len(col)
                        if y == len(col) - 1
                        else tile_padding
                    )
                    for x, location in enumerate(row):
                        left_padding = 0 if x == 0 else tile_padding
                        right_padding = (
                            resource_width - tile_width * len(row)
                            if x == len(row) - 1
                            else tile_padding
                        )
                        tile_raw_media = self.locate_raw_media(location)
                        if tile_raw_media is not None:
                            tile_count += 1
                            separate_tiles_size += len(tile_raw_media)
                            tile = Image.open(io.BytesIO(tile_raw_media))
                            twidth, theight = tile.size
                            if (
                                twidth != tile_width + left_padding + right_padding
                                or theight != tile_height + top_padding + bottom_padding
                            ):
                                log.error(
                                    "Resource %s tile %d, %d size (%d, %d) does not have expected padding %d of (%d, %d) for %s"
                                    % (
                                        resource_name,
                                        x,
                                        y,
                                        twidth,
                                        theight,
                                        tile_padding,
                                        tile_width,
                                        tile_height,
                                        resource_name,
                                    )
                                )
                            crop = (
                                left_padding,
                                top_padding,
                                tile_width + left_padding,
                                tile_height + top_padding,
                            )
                            tile = tile.crop(crop)
                            full_image.paste(tile, (x * tile_width, y * tile_height))
                            tile.close()
                if full_image.size != (resource_width, resource_height):
                    log.error(
                        "Resource %s combined tiled image size is (%d, %d) but should be (%d, %d) for %s"
                        % (
                            resource_name,
                            full_image.size[0],
                            full_image.size[1],
                            resource_width,
                            resource_height,
                            resource_name,
                        )
                    )
                # Binary search between quality bounds, keeping the encoding
                # whose size is closest to the original combined tiles.
                min_quality = MIN_JPEG_QUALITY
                max_quality = MAX_JPEG_QUALITY
                best_size_diff = best_quality = raw_media = None
                while True:
                    quality = (max_quality + min_quality) // 2
                    outfile = io.BytesIO()
                    full_image.save(
                        outfile,
                        "jpeg" if extension == ".jpg" else extension[1:],
                        quality=quality,
                    )
                    test_raw_media = outfile.getvalue()
                    outfile.close()
                    size_diff = abs(separate_tiles_size - len(test_raw_media))
                    if best_size_diff is None or size_diff < best_size_diff:
                        best_size_diff = size_diff
                        best_quality = quality
                        raw_media = test_raw_media
                    if separate_tiles_size > len(test_raw_media):
                        min_quality = quality + 1
                    else:
                        max_quality = quality - 1
                    if max_quality < min_quality:
                        break
            # NOTE(review): the "or True" makes this warning unconditional.
            if (
                best_size_diff * 100
            ) // separate_tiles_size > TILE_SIZE_REPORT_PERCENTAGE or True:
                log.warning(
                    "Image resource %s has %d tiles with total size %d combined into image of size %d quality %d"
                    % (
                        resource_name,
                        tile_count,
                        separate_tiles_size,
                        len(raw_media),
                        best_quality,
                    )
                )
            # Use the tile bundle's base name as the logical location.
            location = location.partition("-tile")[0]
        else:
            location = resource.pop("$165")
            search_path = resource.pop("$166", location)
            if search_path != location:
                log.error(
                    "Image resource %s has location %s != search_path %s"
                    % (resource_name, location, search_path)
                )
            raw_media = self.locate_raw_media(location)
        mime = resource.pop("$162", None)
        if mime in EXTS_OF_MIMETYPE:
            # Refine a generic extension using the declared mime type.
            if extension == ".pobject" or extension == ".bin":
                if mime == "figure":
                    extension = image_file_ext(raw_media)
                else:
                    extension = EXTS_OF_MIMETYPE[mime][0]
        elif mime is not None:
            log.error(
                "Resource %s has unknown mime type: %s" % (resource_name, repr(mime))
            )
        # Prefer an authoring-time source filename when available.
        location_fn = location
        location_fn = resource.pop(
            "yj.conversion.source_resource_filename", location_fn
        )
        location_fn = resource.pop("yj.authoring.source_file_name", location_fn)
        if (extension == ".pobject" or extension == ".bin") and "." in location_fn:
            extension = "." + location_fn.rpartition(".")[2]
        if not location_fn.endswith(extension):
            location_fn = location_fn.partition(".")[0] + extension
        # Discard metadata properties that are not needed for conversion.
        resource.pop("$597", None)
        resource.pop("$57", None)
        resource.pop("$56", None)
        resource.pop("$499", None)
        resource.pop("$500", None)
        resource.pop("$137", None)
        resource.pop("$136", None)
        referred_resources = resource.pop("$167", [])
        if "$214" in resource:
            # Referenced secondary resource: process without saving it.
            self.process_external_resource(resource.pop("$214"), save=False)
        if FIX_JPEG_XR and (format == "$548") and (raw_media is not None):
            # JPEG-XR is not supported by EPUB readers: convert via TIFF to
            # PNG (when the image has alpha) or JPEG.
            try:
                tiff_data = convert_jxr_to_tiff(raw_media, location_fn)
            except Exception as e:
                log.error(
                    "Exception during conversion of JPEG-XR '%s' to TIFF: %s"
                    % (location_fn, repr(e))
                )
            else:
                with disable_debug_log():
                    img = Image.open(io.BytesIO(tiff_data))
                    ofmt, extension = (
                        ("PNG", ".png") if img.mode == "RGBA" else ("JPEG", ".jpg")
                    )
                    outfile = io.BytesIO()
                    img.save(outfile, ofmt, quality=JXR_TO_JPEG_QUALITY)
                    img.close()
                    raw_media = outfile.getvalue()
                    outfile.close()
                location_fn = location_fn.rpartition(".")[0] + extension
        suffix = ""
        if (
            FIX_PDF
            and format == "$565"
            and raw_media is not None
            and "$564" in resource
        ):
            # Single PDF page ("$564" is the zero-based page index): render
            # the page to JPEG so it can be displayed as an image.
            page_num = resource["$564"] + 1
            try:
                jpeg_data = convert_pdf_to_jpeg(
                    raw_media, page_num, reported_errors=self.reported_pdf_errors
                )
            except Exception as e:
                log.error(
                    'Exception during conversion of PDF "%s" page %d to JPEG: %s'
                    % (location_fn, page_num, repr(e))
                )
            else:
                raw_media = jpeg_data
                extension = ".jpg"
                location_fn = location_fn.rpartition(".")[0] + extension
                suffix = "-page%d" % page_num
                resource.pop("$564")
        filename = self.resource_location_filename(
            location_fn, suffix, self.epub.IMAGE_FILEPATH
        )
        if not ignore_variants:
            # Substitute a strictly larger variant when enabled.
            for rr in resource.pop("$635", []):
                variant = self.get_external_resource(rr, ignore_variants=True)
                if (
                    USE_HIGHEST_RESOLUTION_IMAGE_VARIANT
                    and variant is not None
                    and variant.width > resource_width
                    and variant.height > resource_height
                ):
                    if self.DEBUG:
                        log.info(
                            "Replacing image %s (%dx%d) with variant %s (%dx%d)"
                            % (
                                filename,
                                resource_width,
                                resource_height,
                                variant.filename,
                                variant.width,
                                variant.height,
                            )
                        )
                    raw_media, filename, resource_width, resource_height = (
                        variant.raw_media,
                        variant.filename,
                        variant.width,
                        variant.height,
                    )
        if "$564" in resource:
            # PDF page was not converted: keep the page as a URI fragment.
            filename += "#page=%d" % (resource.pop("$564") + 1)
        self.check_empty(resource, "resource %s" % resource_name)
        resource_obj = self.resource_cache[resource_name] = Obj(
            raw_media=raw_media,
            filename=filename,
            extension=extension,
            format=format,
            mime=mime,
            location=location,
            width=resource_width,
            height=resource_height,
            referred_resources=referred_resources,
            manifest_entry=None,
        )
        return resource_obj
def process_external_resource(
    self,
    resource_name,
    save=True,
    process_referred=False,
    save_referred=False,
    is_plugin=False,
    is_referred=False,
):
    """Resolve *resource_name* to its resource object and optionally write
    its raw media into the EPUB manifest.

    save: manifest the raw media (only when self.save_resources is set).
    process_referred / save_referred: also process resources this one
        refers to, saving them when save_referred is true.
    is_plugin / is_referred: origin of the reference; used for the format
        sanity checks at the end and for filename/mimetype handling.
    Returns the resource object with .filename/.manifest_entry updated.
    """
    resource_obj = self.get_external_resource(resource_name)
    # Chained identity comparison: raw_media is not None AND
    # manifest_entry is None (media exists and is not yet manifested).
    if (
        save
        and self.save_resources
        and resource_obj.raw_media is not None is resource_obj.manifest_entry is None
    ):
        filename = (
            root_filename(resource_obj.location)
            if is_referred
            else resource_obj.filename
        )
        # Split off any "#fragment" (e.g. a PDF page reference) so only the
        # real file name is used for manifest lookups; re-attached below.
        filename, fragment_sep, fragment = filename.partition("#")
        base_filename = filename
        cnt = 0
        while filename in self.epub.oebps_files:
            if (
                self.epub.oebps_files[filename].binary_data
                == resource_obj.raw_media
            ):
                # Identical content already manifested: reuse that entry.
                resource_obj.manifest_entry = self.epub.manifest_files[filename]
                break
            if is_referred and cnt == 0:
                log.error(
                    "Multiple referred resources exist with location %s"
                    % resource_obj.location
                )
            # Name collision with different content: append a counter and
            # retry the lookup.
            fn, ext = posixpath.splitext(base_filename)
            filename = "%s_%d%s" % (fn, cnt, ext)
            cnt += 1
        else:
            # while/else: loop finished without break, i.e. no existing
            # file matched, so manifest the media as a new resource.
            resource_obj.manifest_entry = self.epub.manifest_resource(
                filename,
                data=resource_obj.raw_media,
                height=resource_obj.height,
                width=resource_obj.width,
                mimetype=resource_obj.mime if is_referred else None,
            )
        resource_obj.filename = filename + fragment_sep + fragment
        resource_obj.is_saved = True
    if process_referred or save_referred:
        for rr in resource_obj.referred_resources:
            self.process_external_resource(rr, save=save_referred, is_referred=True)
    if is_referred:
        pass
    elif is_plugin and resource_obj.format not in ["$287", "$284"]:
        log.error(
            "Unexpected plugin resource format %s for %s"
            % (resource_obj.format, resource_name)
        )
    elif (not is_plugin) and resource_obj.extension == ".pobject":
        # NOTE(review): is_plugin may also be None (see uri_reference); a
        # None value falls into this "non-plugin" branch -- confirm intended.
        log.error(
            "Unexpected non-plugin resource format %s for %s"
            % (resource_obj.extension, resource_name)
        )
    return resource_obj
def locate_raw_media(self, location, report_missing=True):
    """Return the raw media bytes stored under *location*, or None.

    A successful lookup is recorded in self.used_raw_media so unused
    entries can be reported later.  When the location (or the whole
    "$417" raw-media table) is absent, the miss is logged unless
    report_missing is false.
    """
    try:
        data = self.book_data["$417"][location]
    except Exception:
        if report_missing:
            log.error("Missing bcRawMedia %s" % location)
        return None
    self.used_raw_media.add(location)
    return data
def resource_location_filename(self, location, suffix, filepath_template):
    """Map a resource *location* (plus *suffix*) to a safe, unique file
    path inside the EPUB, formatted through *filepath_template* (a
    "%s"-style template such as the image or font file path).

    Results are memoized in self.location_filenames so the same
    (location, suffix) pair always yields the same path.
    """
    if (location, suffix) in self.location_filenames:
        return self.location_filenames[(location, suffix)]
    if location.startswith("/"):
        # Avoid absolute-looking paths inside the EPUB.
        location = "_" + location[1:]
        # NOTE(review): the cache at the end is keyed on this rewritten
        # value, so leading-"/" locations never hit the cache -- confirm.
    # Replace characters outside the filename-safe set and collapse "//",
    # which would otherwise leave an empty path segment.
    safe_location = re.sub(r"[^A-Za-z0-9_/.-]", "_", location)
    safe_location = safe_location.replace("//", "/x/")
    path, sep, name = safe_location.rpartition("/")
    path += sep
    root, sep, ext = name.rpartition(".")
    ext = sep + ext
    resource_type = RESOURCE_TYPE_OF_EXT.get(ext, "resource")
    # Re-base the name on the unique part of the local symbol so generated
    # names stay stable across conversions.
    unique_part = self.unique_part_of_local_symbol(root)
    root = self.prefix_unique_part_of_symbol(unique_part, resource_type)
    # Strip a leading directory that the template itself re-adds.
    for prefix in ["resource/", filepath_template[1:].partition("/")[0] + "/"]:
        if path.startswith(prefix):
            path = path[len(prefix):]
    safe_filename = filepath_template % ("%s%s%s%s" % (path, root, suffix, ext))
    # Uniqueness is checked case-insensitively, presumably to avoid
    # collisions on case-insensitive filesystems/readers.
    unique_count = 0
    oebps_files_lower = set([n.lower() for n in self.epub.oebps_files.keys()])
    while safe_filename.lower() in oebps_files_lower:
        safe_filename = filepath_template % (
            "%s%s%s-%d%s" % (path, root, suffix, unique_count, ext)
        )
        unique_count += 1
    self.location_filenames[(location, suffix)] = safe_filename
    return safe_filename
def process_fonts(self):
    """Turn the book's font declarations ("$262") into @font-face entries,
    manifesting the raw font files ("$418") into the EPUB.

    Raw font files never referenced by a declaration are still manifested
    (with a warning) so their data is not silently lost.
    """
    fonts = self.book_data.pop("$262", {})
    raw_fonts = self.book_data.pop("$418", {})
    raw_media = self.book_data.get("$417", {})
    used_fonts = {}  # location -> EPUB filename already manifested
    for font in fonts.values():
        location = font.pop("$165")
        if location in used_fonts:
            # Same font file referenced again: point at the existing copy.
            font["src"] = 'url("%s")' % urllib.parse.quote(
                urlrelpath(
                    used_fonts[location], ref_from=self.epub.STYLES_CSS_FILEPATH
                )
            )
        elif location in raw_fonts or (
            self.book.is_kpf_prepub and location in raw_media
        ):
            # KPF pre-publication books may keep fonts in raw media instead.
            raw_font = raw_fonts.pop(location, None) or raw_media.pop(location)
            filename = location
            if "." not in filename:
                # No extension: sniff the font type from the file bytes.
                ext = font_file_ext(raw_font)
                if not ext:
                    log.error(
                        "Font %s has unknown type (possibly obfuscated)" % filename
                    )
                    ext = ".font"
                filename = "%s%s" % (filename, ext)
            filename = self.resource_location_filename(
                filename, "", self.epub.FONT_FILEPATH
            )
            if filename not in self.epub.oebps_files:
                self.epub.manifest_resource(filename, data=raw_font)
            # NOTE(review): quote() wraps urlrelpath() in the branch above
            # but is applied inside it here -- confirm which is intended.
            font["src"] = 'url("%s")' % urlrelpath(
                urllib.parse.quote(filename), ref_from=self.epub.STYLES_CSS_FILEPATH
            )
            used_fonts[location] = filename
        else:
            log.error("Missing bcRawFont %s" % location)
        # Drop properties whose value is "$350" before conversion
        # (apparently a default that should not emit explicit CSS).
        for prop in ["$15", "$12", "$13"]:
            if prop in font and font[prop] == "$350":
                font.pop(prop)
        self.fix_font_name(font["$11"], add=True)
        self.font_faces.append(self.convert_yj_properties(font))
    # Anything left in raw_fonts was never declared: keep it anyway.
    for location in raw_fonts:
        log.warning("Unused font file: %s" % location)
        filename = self.resource_location_filename(
            location, "", self.epub.FONT_FILEPATH
        )
        self.epub.manifest_resource(filename, data=raw_fonts[location])
def uri_reference(
    self, uri, save=True, save_referred=None, manifest_external_refs=False
):
    """Translate a KFX-internal URI into an href usable in the EPUB.

    kfx:     process (and optionally save) the named resource; return its
             EPUB filename.
    navto:/navt:  resolve a (name, float offset) pair via self.navto_anchor;
             on failure, return a deliberately broken "/MISSING_NAVTO#..."
             href so the problem stays visible in the output.
    http(s): returned unchanged, optionally recorded as an external
             manifest reference.
    mailto:  passes through silently; any other scheme is logged as
             unexpected and returned unchanged.
    """
    purl = urllib.parse.urlparse(uri)
    if purl.scheme == "kfx":
        # is_plugin=None: see process_external_resource for how a None
        # (rather than True/False) value is treated by its format checks.
        return self.process_external_resource(
            urllib.parse.unquote(purl.netloc + purl.path),
            is_plugin=None,
            save=save,
            save_referred=save_referred,
        ).filename
    if purl.scheme in ["navto", "navt"]:
        anchor = self.navto_anchor.get(
            (
                urllib.parse.unquote(purl.netloc),
                float(purl.fragment) if purl.fragment else 0.0,
            )
        )
        if anchor is not None:
            return self.anchor_as_uri(anchor)
        else:
            log.error("Failed to locate anchor for %s" % uri)
            return "/MISSING_NAVTO#%s_%s" % (
                urllib.parse.unquote(purl.netloc),
                purl.fragment,
            )
    if purl.scheme in ["http", "https"]:
        if manifest_external_refs:
            self.epub.manifest_resource(uri, external=True, report_dupe=False)
        return uri
    if purl.scheme != "mailto":
        log.error("Unexpected URI scheme: %s" % uri)
    return uri
def unique_file_id(self, filename):
    """Return a unique XML-id-safe identifier for *filename*, memoized in
    self.file_ids so repeated calls return the same id.

    The id is based on the final path component (capped at 64 chars) with
    unsafe characters replaced; collisions with ids of other filenames get
    a numeric suffix.
    """
    if filename in self.file_ids:
        return self.file_ids[filename]
    # Use only the basename; renamed from "id" to avoid shadowing the builtin.
    file_id = re.sub(r"[^A-Za-z0-9.-]", "_", filename.rpartition("/")[2][:64])
    # Ids must start with a letter.  Matching the whole string (instead of
    # the previous file_id[0]) also handles an empty basename, which used
    # to raise IndexError for filenames ending in "/".
    if not re.match(r"^[A-Za-z]", file_id):
        file_id = "id_" + file_id
    if file_id in self.file_ids.values():
        base_id = file_id
        unique_count = 0
        while file_id in self.file_ids.values():
            file_id = "%s_%d" % (base_id, unique_count)
            unique_count += 1
    self.file_ids[filename] = file_id
    return file_id

View File

@@ -0,0 +1,288 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import io
from PIL import Image
try:
import PyPDF2
except ImportError:
try:
from . import PyPDF2
except ImportError:
PyPDF2 = None
from .ion import (
IonAnnotation,
IonList,
IonSExp,
IonString,
IonStruct,
IonSymbol,
ion_type,
)
from .message_logging import log
from .utilities import convert_jxr_to_tiff, disable_debug_log, list_symbols
from .yj_container import YJFragmentKey
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
class ImageResource(object):
    """Lightweight record tying an image's source location to its KFX
    format symbol and its raw bytes."""

    def __init__(self, location, image_format, data):
        # Plain data holder; attributes are read directly by the PDF code.
        self.location, self.image_format, self.data = location, image_format, data
class KFX_PDF(object):
    """Extracts or builds a PDF from the resources of a KFX book.

    Two paths: extract_pdf_resources() pulls out resources that are already
    PDF ("$565") and merges them; convert_image_resources() renders the
    book's image resources into a new PDF.  The "$nnn" strings are the
    obfuscated KFX symbol names used throughout this library.
    """

    def __init__(self, book):
        # book: the parsed YJ/KFX container whose fragments are walked below.
        self.book = book

    def extract_pdf_resources(self):
        """Return the book's PDF resources merged into one bytes object,
        or None when there are none (or merging failed)."""
        ordered_pdfs = self.get_ordered_images(["$565"])
        if len(ordered_pdfs) == 0:
            pdf_data = None
        elif len(ordered_pdfs) == 1:
            # A single PDF resource needs no merging.
            pdf_data = ordered_pdfs[0].data
        elif PyPDF2 is None:
            # Multiple parts but no merger available (see import fallback).
            log.error("PyPDF2 package is missing. Unable to combine PDF resources")
            pdf_data = None
        else:
            try:
                merger = PyPDF2.PdfFileMerger()
                for single_pdf in ordered_pdfs:
                    merger.append(fileobj=io.BytesIO(single_pdf.data))
                merged_file = io.BytesIO()
                merger.write(merged_file)
                pdf_data = merged_file.getvalue()
                merged_file.close()
            except Exception as e:
                # Best effort: a corrupt part aborts the merge, not the run.
                log.error(repr(e))
                pdf_data = None
            if pdf_data is not None:
                log.info(
                    "Combined %d PDF resources into a single file" % len(ordered_pdfs)
                )
        return pdf_data

    def convert_image_resources(self):
        """Render the book's referenced image resources into a single PDF;
        returns bytes or None if there are no images."""
        ordered_images = self.get_ordered_images(
            ["$286", "$285", "$548", "$284"],
            include_unreferenced=False,
            allow_duplicates=True,
        )
        return convert_images_to_pdf_data(ordered_images)

    def get_ordered_images(
        self, formats, include_unreferenced=True, allow_duplicates=False
    ):
        """Collect ImageResource objects whose format is in *formats*,
        ordered by first reference in the book's reading order.

        include_unreferenced: also append (and log) never-referenced ones.
        allow_duplicates: keep repeated references instead of de-duping.
        """
        image_resource_location = {}  # resource fragment id -> media location
        image_resources = {}  # media location -> ImageResource
        # "$164": external resource fragment; "$161": format; "$165":
        # location; "$417": raw media data.
        for fragment in self.book.fragments.get_all("$164"):
            resource = fragment.value
            resource_format = resource.get("$161")
            if resource_format in formats:
                location = resource.get("$165")
                if location is not None and location not in image_resources:
                    raw_media = self.book.fragments.get(ftype="$417", fid=location)
                    if raw_media is not None:
                        image_resource_location[fragment.fid] = location
                        image_resources[location] = ImageResource(
                            location, resource_format, raw_media.value
                        )
        ordered_images = []
        unused_image_resource_ids = set(image_resources.keys())
        for fid in self.collect_image_references(allow_duplicates):
            location = image_resource_location.get(fid)
            image_resource = image_resources.get(location)
            if image_resource is not None:
                ordered_images.append(image_resource)
                unused_image_resource_ids.discard(location)
        if unused_image_resource_ids and include_unreferenced:
            log.error(
                "Found unreferenced resources: %s"
                % list_symbols(unused_image_resource_ids)
            )
            # NOTE: despite the name, the loop variable here is a media
            # location key -- unused_image_resource_ids holds locations.
            for fid in unused_image_resource_ids:
                ordered_images.append(image_resources[fid])
        return ordered_images

    def collect_image_references(self, allow_duplicates=False):
        """Walk every section's content tree and return the resource names
        ("$175") of referenced images, in document order."""
        processed_story_names = set()
        ordered_image_resources = []

        def collect_section_info(section_name):
            # Stories deferred to the end of this section (conditional
            # structure) and resources already seen within the section.
            pending_story_names = []
            section_image_resources = set()

            def walk_content(data, content_key):
                # Recursive walk over the Ion data tree of a section/story.
                data_type = ion_type(data)
                if data_type is IonAnnotation:
                    walk_content(data.value, content_key)
                elif data_type is IonList:
                    for i, fc in enumerate(data):
                        # In KPF pre-publication books, list entries under
                        # "$146"/"$274" may be symbols naming "$608"
                        # fragments; dereference before walking.
                        if (
                            content_key in {"$146", "$274"}
                            and self.book.is_kpf_prepub
                            and ion_type(fc) is IonSymbol
                        ):
                            fc = self.book.fragments[
                                YJFragmentKey(ftype="$608", fid=fc)
                            ]
                        walk_content(fc, content_key)
                elif data_type is IonSExp:
                    for fc in data:
                        walk_content(fc, content_key)
                elif data_type is IonStruct:
                    annot_type = data.get("$687")
                    typ = data.get("$159")
                    if typ == "$271":
                        # "$271" entries reference an image via "$175".
                        resource_name = data.get("$175")
                        if (
                            resource_name is not None
                            and resource_name not in section_image_resources
                        ):
                            section_image_resources.add(resource_name)
                            if (
                                allow_duplicates
                                or resource_name not in ordered_image_resources
                            ):
                                ordered_image_resources.append(resource_name)
                    if "$141" in data:
                        for pt in data["$141"]:
                            if isinstance(pt, IonAnnotation):
                                pt = pt.value
                            walk_content(pt, "$141")
                    if "$683" in data:
                        walk_content(data["$683"], "$683")
                    if "$749" in data:
                        walk_content(
                            self.book.fragments[
                                YJFragmentKey(ftype="$259", fid=data["$749"])
                            ],
                            "$259",
                        )
                    if "$146" in data:
                        walk_content(data["$146"], "$274" if typ == "$274" else "$146")
                    if "$145" in data and annot_type not in ["$584", "$690"]:
                        fv = data["$145"]
                        if ion_type(fv) is not IonStruct:
                            walk_content(fv, "$145")
                    if "$176" in data and content_key != "$259":
                        # "$176" names a story ("$259").  Conditional-
                        # structure books defer stories until the section
                        # has been fully walked; otherwise walk once, now.
                        fv = data["$176"]
                        if self.book.is_conditional_structure:
                            if fv not in pending_story_names:
                                pending_story_names.append(fv)
                        else:
                            if fv not in processed_story_names:
                                walk_content(
                                    self.book.fragments[
                                        YJFragmentKey(ftype="$259", fid=fv)
                                    ],
                                    "$259",
                                )
                                processed_story_names.add(fv)
                    # Recurse into remaining non-string fields that were not
                    # handled (or deliberately skipped) above.
                    for fk, fv in data.items():
                        if ion_type(fv) != IonString and fk not in {
                            "$749",
                            "$584",
                            "$683",
                            "$145",
                            "$146",
                            "$141",
                            "$702",
                            "$250",
                            "$176",
                            "yj.dictionary.term",
                            "yj.dictionary.unnormalized_term",
                        }:
                            walk_content(fv, fk)

            # "$260" is the section fragment type.
            walk_content(
                self.book.fragments[YJFragmentKey(ftype="$260", fid=section_name)],
                "$260",
            )
            # Walk any deferred stories, each at most once per book.
            for story_name in pending_story_names:
                if story_name not in processed_story_names:
                    walk_content(
                        self.book.fragments[
                            YJFragmentKey(ftype="$259", fid=story_name)
                        ],
                        "$259",
                    )
                    processed_story_names.add(story_name)

        for section_name in self.book.ordered_section_names():
            collect_section_info(section_name)
        return ordered_image_resources
def convert_images_to_pdf_data(ordered_images):
    """Render *ordered_images* (ImageResource objects) into one PDF.

    Returns the PDF as bytes, or None when the list is empty.  JPEG-XR
    ("$548") images are converted to TIFF first since PIL cannot read
    them directly; every page is normalized to RGB.
    """
    if not ordered_images:
        return None
    pages = []
    for resource in ordered_images:
        payload = resource.data
        if resource.image_format == "$548":
            try:
                payload = convert_jxr_to_tiff(payload, resource.location)
            except Exception as e:
                log.error(
                    "Exception during conversion of JPEG-XR '%s' to TIFF: %s"
                    % (resource.location, repr(e))
                )
        with disable_debug_log():
            pages.append(Image.open(io.BytesIO(payload)).convert("RGB"))
    # Pillow writes a multi-page PDF from the first image plus append_images.
    first_page = pages.pop(0)
    buffer = io.BytesIO()
    with disable_debug_log():
        first_page.save(buffer, "pdf", save_all=True, append_images=pages)
        for page in pages:
            page.close()
        first_page.close()
    result = buffer.getvalue()
    buffer.close()
    return result

View File

@@ -0,0 +1,676 @@
#!/usr/bin/python
# -*- coding: utf8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = "GPL v3"
__copyright__ = "2016-2022, John Howell <jhowell@acm.org>"
# Sentinel meaning "any value is acceptable" for a metadata/feature key.
ANY = None

# Shorthand for keys whose value must be a boolean.
TF = {True, False}

# Generator package-version strings that carry no real version information
# (placeholders; one of them mangled by stray terminal color codes).
PACKAGE_VERSION_PLACEHOLDERS = {
    "PackageVersion:YJReaderSDK-1.0.x.x GitSHA:c805492 Month-Day:04-22",
    "PackageVersion:YJReaderSDK-1.0.x.x GitSHA:[33mc805492[m Month-Day:04-22",
    "kfxlib-00000000",
}
# (application version, SDK package string, month-day) triples observed from
# known KFX generators; the full package string is reassembled below.
_VERSIONED_GENERATORS = [
    ("2.16", "1.0.824.0", "04-09"),
    ("3.41.1.0", "1.0.1962.11", "10-17"),
    ("3.42.1.0", "1.0.2044.4", "10-28"),
    ("6.11.1.2", "1.0.2467.43", "07-05"),
    ("6.11.1.2", "1.0.2467.8", "07-14"),
    ("6.11.1.2", "1.0.2539.3", "03-17"),
    ("6.20.1.0", "1.0.2685.4", "05-19"),
    ("6.24.1.0", "1.1.67.2", "06-18"),
    ("6.28.1.0", "1.1.67.4", "07-14"),
    ("6.28.2.0", "1.1.147.0", "09-10"),
    ("7.38.1.0", "1.2.173.0", "09-20"),
    ("7.45.1.0", "1.4.23.0", "11-23"),
    ("7.58.1.0", "1.5.116.0", "02-25"),
    ("7.66.1.0", "1.5.185.0", "04-13"),
    ("7.66.1.0", "1.5.195.0", "04-20"),
    ("7.91.1.0", "1.5.566.6", "11-03"),
    ("7.91.1.0", "1.5.595.1", "11-30"),
    ("7.111.1.1", "1.6.444.0", "02-27"),
    ("7.111.1.1", "1.6.444.5", "03-20"),
    ("7.121.3.0", "1.6.444.18", "05-02"),
    ("7.125.1.0", "1.6.444.24", "06-01"),
    ("7.125.1.0", "1.6.444.33", "06-16"),
    ("7.131.2.0", "1.6.444.36", "07-10"),
    ("7.135.2.0", "1.6.1034.2", "08-23"),
    ("7.135.2.0", "1.6.1034.13", "10-09"),
    ("7.135.2.0", "1.6.1034.17", "11-06"),
    ("7.149.1.0", "1.6.1034.59", "12-06"),
    ("7.149.1.0", "1.6.1034.62", "12-21"),
    ("7.149.1.0", "1.6.1034.72", "01-04"),
    ("7.149.1.0", "1.6.1871.0", "01-23"),
    ("7.149.1.0", "1.6.1938.0", "01-29"),
    ("7.149.1.0", "1.6.2071.0", "02-12"),
    ("7.149.1.0", "1.6.200363.0", "03-19"),
]

# Newer generators report an application version but no package version.
_UNVERSIONED_GENERATORS = (
    "7.153.1.0 7.165.1.1 7.168.1.0 7.171.1.0 7.174.1.0 7.177.1.0 7.180.1.0 "
    "7.182.1.0 7.188.1.0 7.191.1.0 7.213.1.0 7.220.2.0 7.228.1.0 7.232.1.0 "
    "7.236.1.0 20.12.238.0"
).split()

# (application version, package version) pairs accepted by is_known_generator.
KNOWN_KFX_GENERATORS = {
    (app, "PackageVersion:YJReaderSDK-%s Month-Day:%s" % (sdk, month_day))
    for app, sdk, month_day in _VERSIONED_GENERATORS
} | {(app, "") for app in _UNVERSIONED_GENERATORS}
# (file_creator, creator_version) pairs too generic to identify a tool.
GENERIC_CREATOR_VERSIONS = set(
    [("YJConversionTools", "2.15.0"), ("KTC", "1.0.11.1"), ("", "")]
)
# Known feature namespaces/keys and their accepted version values.
# A value of ANY means any version is acceptable (see is_known_feature).
KNOWN_FEATURES = {
    "symbols": {
        "max_id": {
            489, 609, 620, 626, 627, 634, 652, 662, 667, 668, 673, 681,
            693, 695, 696, 697, 700, 701, 705, 716, 748, 753, 754, 755,
            759, 761, 777, 779, 783, 785, 786, 787, 789, 797, 804, 825,
        },
    },
    "format_capabilities": {
        "kfxgen.pidMapWithOffset": {1},
        "kfxgen.positionMaps": {2},
        "kfxgen.textBlock": {1},
        "db.schema": {1},
    },
    "SDK.Marker": {
        "CanonicalFormat": {1, 2},
    },
    "com.amazon.yjconversion": {
        "ar-reflow-language": {1},
        "cn-reflow-language": {1},
        "indic-reflow-language": {1},
        "jp-reflow-language": {1},
        "jpvertical-reflow-language": {2, 3, 4, 5, 6, 7},
        "reflow-language": {2, 3},
        "reflow-language-expansion": {1},
        "tcn-reflow-language": {1},
        "multiple_reading_orders-switchable": {1},
        "reflow-section-size": ANY,
        # reflow-style also allows two sentinel (major, minor) tuples.
        "reflow-style": {
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
            (2147483646, 2147483647),
            (2147483647, 2147483647),
        },
        "yj_audio": {1, 2},
        "yj_custom_word_iterator": {1},
        "yj_dictionary": {1, 2},
        "yj_double_page_spread": {1},
        "yj_facing_page": {1},
        "yj_fixed_layout": {1},
        "yj_graphical_highlights": {1},
        "yj_hdv": {1, 2},
        "yj_interactive_image": {1},
        "yj_jpegxr_sd": {1},
        "yj_jpg_rst_marker_present": {1},
        "yj_mathml": {1},
        "yj_mixed_writing_mode": {1, 2},
        "yj_non_pdf_fixed_layout": {2},
        "yj_pdf_links": {1},
        "yj_pdf_support": {1},
        "yj_publisher_panels": {2},
        "yj_rotated_pages": {1},
        "yj_ruby": {1},
        "yj_table": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
        "yj_table_viewer": {1, 2},
        "yj_textbook": {1},
        "yj_thumbnails_present": {1},
        "yj_vertical_text_shadow": {1},
        "yj_video": {1},
        "yj.conditional_structure": {1},
        "yj.illustrated_layout": {1},
    },
}
# supported_features entries (KFX symbol tuples) that are known/benign.
KNOWN_SUPPORTED_FEATURES = set(
    [("$660",), ("$751",), ("$664", "crop_bleed", 1)]
)
# Observed creator tool versions (whitespace-separated for compactness).
_CREATOR_VERSIONS = set(
    """
    2.15.0 0.1.24.0 0.1.26.0 2.0.0.1 1.0.11.1 1.3.0.0 1.5.14.0 1.8.1.0
    1.9.2.0 1.11.399.0 1.11.539.0 1.12.11.0 1.13.7.0 1.13.10.0 0.93.187.0
    0.94.32.0 0.95.8.0 0.96.4.0 0.96.40.0 0.97.79.3 0.98.260.0 0.98.315.0
    0.99.28.0 0.101.1.0 0.102.0.0 0.103.0.0 1.0.319.0 1.1.58.0 1.2.83.0
    1.3.30.0 1.4.200067.0 1.5.60.0 1.6.97.0 1.7.223.0 1.8.50.0 1.9.52.0
    1.10.214.0 1.11.576.0 1.12.39.0 1.14.112.0 1.15.20.0 1.16.2.0 1.18.0.0
    1.20.1.0 1.21.6.0 1.22.13.0 1.23.0.0 1.24.33.0 1.25.34.0 1.26.14.0
    1.27.14.0 1.28.12.0 1.29.17.0 1.30.4.0 1.31.0.0 1.32.1.0 1.33.3.0
    1.34.20.0 1.35.210.0 1.35.618.0 1.35.770.0 1.36.1.0 1.36.20.0 1.37.2.0
    1.38.0.0 1.38.37.0 1.39.30.0 1.40.6.0 1.41.10.0 1.42.2.0 1.42.6.0
    1.43.0.0 1.44.13.0 1.45.20.0 1.46.2.0 1.47.1.0 1.48.7.0 1.49.0.0
    1.50.0.0 1.51.1.0 1.52.2.0 1.52.4.0 1.52.6.0 1.53.1.0 1.54.0.0 1.55.0.0
    1.56.0.0 1.57.0.0 1.58.0.0 1.59.0.0 1.60.0.0 1.60.1.0 1.60.2.0 1.61.0.0
    1.62.0.0 1.62.1.0 1.63.0.0 3.0.0 3.1.0 3.2.0 3.3.0 3.4.0 3.5.0 3.6.0
    3.7.0 3.7.1 3.8.0 3.9.0 3.10.0 3.10.1 3.11.0 3.12.0 3.13.0 3.14.0
    3.15.0 3.16.0 3.17.0 3.17.1 3.20.0 3.20.1 3.21.0 3.22.0 3.23.0 3.24.0
    3.25.0 3.26.0 3.27.0 3.28.0 3.28.1 3.29.0 3.29.1 3.29.2 3.30.0 3.31.0
    3.32.0 3.33.0 3.34.0 3.35.0 3.36.0 3.36.1 3.37.0 3.38.0 3.39.0 3.39.1
    3.40.0 3.41.0 3.42.0 3.43.0 3.44.0 3.45.0 3.46.0 3.47.0 3.48.0 3.49.0
    3.50.0 3.51.0 3.52.0 3.52.1 3.53.0 3.54.0 3.55.0 3.56.0 3.56.1 3.57.0
    3.57.1 3.58.0 3.59.0 3.59.1 3.60.0 3.61.0
    """.split()
)

# Expected metadata keys per category; ANY means the value is unrestricted
# and TF means the value must be a boolean (see is_known_metadata).
KNOWN_METADATA = {
    "book_navigation": {"pages": ANY},
    "kindle_audit_metadata": {
        "file_creator": {"YJConversionTools", "FLYP", "KTC", "KC", "KPR"},
        "creator_version": _CREATOR_VERSIONS,
    },
    "kindle_capability_metadata": {
        "continuous_popup_progression": {0},
        "graphical_highlights": {1},
        "yj_double_page_spread": {1},
        "yj_facing_page": {1},
        "yj_fixed_layout": {1},
        "yj_has_animations": {1},
        "yj_illustrated_layout": {1},
        "yj_publisher_panels": {1},
        "yj_textbook": {1},
    },
    "kindle_ebook_metadata": {
        "book_orientation_lock": {"landscape", "portrait", "none"},
        "multipage_selection": {"disabled"},
        "nested_span": {"enabled"},
        "selection": {"enabled"},
        "user_visible_labeling": {"page_exclusive"},
    },
    "kindle_title_metadata": {
        "ASIN": ANY,
        "asset_id": ANY,
        "author": ANY,
        "author_pronunciation": ANY,
        "book_id": ANY,
        "cde_content_type": {"EBOK", "EBSP", "MAGZ", "PDOC"},
        "content_id": ANY,
        "cover_image": ANY,
        "description": ANY,
        "dictionary_lookup": ANY,
        "editionVersion": ANY,
        "imprint_pronunciation": ANY,
        "is_dictionary": {True},
        "is_sample": TF,
        "issue_date": ANY,
        "itemType": {"MAGZ"},
        "language": ANY,
        "override_kindle_font": TF,
        "parent_asin": ANY,
        "periodicals_generation_V2": {"true"},
        "publisher": ANY,
        "title": ANY,
        "title_pronunciation": ANY,
        "updateTime": ANY,
    },
    "metadata": {
        "ASIN": ANY,
        "asset_id": ANY,
        "author": ANY,
        "binding_direction": {"binding_direction_left"},
        "cde_content_type": {"EBOK", "MAGZ", "PDOC"},
        "cover_image": ANY,
        "cover_page": ANY,
        "description": ANY,
        "doc_sym_publication_id": ANY,
        "issue_date": ANY,
        "language": ANY,
        "orientation": {"portrait", "landscape"},
        "parent_asin": ANY,
        "publisher": ANY,
        "reading_orders": ANY,
        "support_landscape": TF,
        "support_portrait": TF,
        "target_NarrowDimension": ANY,
        "target_WideDimension": ANY,
        "title": ANY,
        "version": {1.0},
        "volume_label": ANY,
    },
}
# Expected auxiliary-metadata keys.  Unrestricted (ANY) keys are listed
# first; keys with a constrained value set are merged in afterwards.
KNOWN_AUXILIARY_METADATA = dict.fromkeys(
    (
        "ANCHOR_REFERRED_BY_CONTAINERS",
        "auxData_resource_list",
        "base_line",
        "checkbox_state",
        "dropDown_count",
        "filename.opf",
        "location",
        "ModifiedContentInfo",
        "modified_time",
        "most-common-computed-style",
        "plugin_group_list",
        "resource_stream",
        "size",
        "SourceIdContentInfo",
        "target",
        "text_baseline",
        "yj.dictionary.first_head_word",
        "yj.dictionary.inflection_rules",
    ),
    ANY,
)
KNOWN_AUXILIARY_METADATA.update(
    {
        "button_type": {1},
        "has_large_data_table": TF,
        "IsSymNameBased": TF,
        "IS_TARGET_SECTION": {True},
        "kSectionContainsAVI": {True},
        "links_extracted": {True},
        "link_from_text": TF,
        "mime": {"Audio", "Figure", "Video"},
        "namespace": {"KindleConversion"},
        "num-dual-covers-removed": {1},
        "page_rotation": {0, 1},
        "resizable_plugin": TF,
        "text_ext": {1},
        "type": {"resource"},
    }
)
# Tool versions recorded in KCB data must match the audit metadata list.
_KCB_TOOL_VERSIONS = KNOWN_METADATA["kindle_audit_metadata"]["creator_version"]

# Expected keys/values in KCB container data, by category.
KNOWN_KCB_DATA = {
    "book_state": {
        "book_input_type": [0, 1, 2, 3, 4, 6, 7],
        "book_reading_direction": [0, 2],
        "book_target_type": [1, 2, 3],
    },
    "content_hash": {},
    "metadata": {
        "book_path": ANY,
        "edited_tool_versions": _KCB_TOOL_VERSIONS,
        "format": ["yj"],
        "global_styling": TF,
        "id": ANY,
        "log_path": ANY,
        "platform": ["mac", "win"],
        "quality_report": ANY,
        "source_path": ANY,
        "tool_name": ["KC", "KPR", "KTC", "Kindle Previewer 3"],
        "tool_version": _KCB_TOOL_VERSIONS,
    },
    "tool_data": {
        "cache_path": ANY,
        "created_on": ANY,
        "last_modified_time": ANY,
        "link_extract_choice": TF,
        "link_notification_preference": TF,
    },
}
def is_known_generator(kfxgen_application_version, kfxgen_package_version):
    """True if this (application, package) generator pair is recognized.

    kfxlib/KC/KPR generators (and an empty application version) are always
    accepted; placeholder package versions are normalized to "" before the
    lookup in KNOWN_KFX_GENERATORS.
    """
    app = kfxgen_application_version
    if app == "" or app.startswith(("kfxlib", "KC", "KPR")):
        return True
    pkg = kfxgen_package_version
    if pkg in PACKAGE_VERSION_PLACEHOLDERS:
        pkg = ""
    return (app, pkg) in KNOWN_KFX_GENERATORS
def is_known_feature(cat, key, val):
    """True if feature *key* = *val* in namespace *cat* is a known value."""
    allowed = KNOWN_FEATURES.get(cat, {}).get(key, [])
    return allowed is ANY or val in allowed
def is_known_metadata(cat, key, val):
    """True if metadata *key* = *val* in category *cat* is a known value."""
    allowed = KNOWN_METADATA.get(cat, {}).get(key, [])
    return allowed is ANY or val in allowed
def is_known_aux_metadata(key, val):
    """True if auxiliary metadata *key* = *val* is a known value."""
    allowed = KNOWN_AUXILIARY_METADATA.get(key, [])
    return allowed is ANY or val in allowed
def is_known_kcb_data(cat, key, val):
    """True if KCB data *key* = *val* in category *cat* is a known value."""
    allowed = KNOWN_KCB_DATA.get(cat, {}).get(key, [])
    return allowed is ANY or val in allowed

View File

@@ -1,4 +1,5 @@
import re
from kindle_download_helper.config import GITHUB_README_COMMENTS

105
no_cli.py Normal file
View File

@@ -0,0 +1,105 @@
import argparse
import os
import time
from kindle_download_helper.config import (
DEFAULT_OUT_DEDRM_DIR,
DEFAULT_OUT_DIR,
DEFAULT_OUT_EPUB_DIR,
)
from kindle_download_helper.no_kindle import NoKindle
def no_main():
    """CLI entry point: log in to Amazon with email/password and download
    every ebook in the account's library (no Kindle device required).

    Raises if email or password is missing; sleeps one second between
    downloads so Amazon is not hammered.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--email", help="amazon login email")
    parser.add_argument("-p", "--password", help="amazon login password")
    # One mutually-overriding store_const flag per storefront (the last flag
    # given wins); the shared default remains "cn".
    # NOTE(review): --com stores const "uk" and its help text says co.uk,
    # which makes it an exact duplicate of --uk -- looks like a copy/paste
    # slip that probably should be const "com"; confirm which domain values
    # NoKindle accepts before changing it.
    for flag, const, site in [
        ("--com", "uk", "amazon.co.uk"),
        ("--cn", "cn", "amazon.cn"),
        ("--jp", "co.jp", "amazon.co.jp"),
        ("--de", "de", "amazon.de"),
        ("--uk", "uk", "amazon.co.uk"),
    ]:
        parser.add_argument(
            flag,
            dest="domain",
            action="store_const",
            const=const,
            default="cn",
            help="if your account is an %s account" % site,
        )
    # Help text: fixed "dwonload" typos from the original.
    parser.add_argument(
        "-o", "--outdir", default=DEFAULT_OUT_DIR, help="download output dir"
    )
    parser.add_argument(
        "-od",
        "--outdedrmdir",
        default=DEFAULT_OUT_DEDRM_DIR,
        help="download output dedrm dir",
    )
    parser.add_argument(
        "-oe",
        "--outepubmdir",
        default=DEFAULT_OUT_EPUB_DIR,
        help="download output epub dir",
    )
    options = parser.parse_args()
    if options.email is None or options.password is None:
        raise Exception("Please provide email and password")
    # exist_ok avoids the check-then-create race of the old os.path.exists()
    # guard and is a no-op when the directory is already there.
    os.makedirs(options.outdir, exist_ok=True)
    # for epub
    os.makedirs(options.outepubmdir, exist_ok=True)
    nk = NoKindle(options.email, options.password, options.domain)
    nk.make_library()
    for ebook in nk.ebooks:
        try:
            nk.download_book(ebook["ASIN"])
        except Exception as err:  # renamed: no longer shadows the loop variable
            import traceback

            traceback.print_exc()
            print(err)
        # spider rule: pause between downloads to stay polite to Amazon
        time.sleep(1)


if __name__ == "__main__":
    no_main()