/*
8 * Copyright (C) 2022 Authors
10 * Released under GNU GPL v2+, read the file
'COPYING' for more information.
15#include <poppler/UTF.h>
18#include <poppler/GfxFont.h>
19#include <poppler/GfxState.h>
20#include <poppler/PDFDoc.h>
21#include <poppler/PDFDocEncoding.h>
39 return Geom::Affine(ctm[0], ctm[1], ctm[2], ctm[3], ctm[4], ctm[5]);
44 std::cout <<
"C:" <<
label <<
":" << ctm[0] <<
"," << ctm[1] <<
"," << ctm[2] <<
"," << ctm[3] <<
"," << ctm[4]
45 <<
"," << ctm[5] <<
"\n";
50 std::cout <<
"A:" <<
label <<
":" << ctm[0] <<
"," << ctm[1] <<
"," << ctm[2] <<
"," << ctm[3] <<
"," << ctm[4]
51 <<
"," << ctm[5] <<
"\n";
69 fonts.resize(fontDict->getLength());
70 for (std::size_t i = 0; i <
fonts.size(); ++i) {
71 const Object &obj1 = fontDict->getValNF(i);
72 Object obj2 = obj1.fetch(xref);
76 }
else if (fontDictRef) {
79 r.gen = 100000 + fontDictRef->num;
88 fonts[i] = GfxFont::makeFont(xref, fontDict->getKey(i), r, obj2.getDict());
93 error(errSyntaxError, -1,
"font resource is not a dictionary");
101 for (
const auto &font :
fonts) {
102 if (font && font->matches(tag)) {
113 FNVHash() { h = 2166136261U; }
121 void hash(
const char *p,
int n)
124 for (i = 0; i <
n; ++i) {
129 int get31() {
return (h ^ (h >> 31)) & 0x7fffffff; }
150 switch (obj->getType()) {
153 h->hash(obj->getBool() ? 1 : 0);
158 h->hash((
char *)&n,
sizeof(int));
163 h->hash((
char *)&r,
sizeof(double));
167 s = obj->getString();
168 h->hash(s->c_str(), s->getLength());
173 h->hash(p, (
int)strlen(p));
180 n = obj->arrayGetLength();
181 h->hash((
char *)&n,
sizeof(int));
182 for (i = 0; i < n; ++i) {
183 const Object &obj2 = obj->arrayGetNF(i);
189 n = obj->dictGetLength();
190 h->hash((
char *)&n,
sizeof(
int));
191 for (i = 0; i < n; ++i) {
192 p = obj->dictGetKey(i);
193 h->hash(p, (
int)strlen(p));
194 const Object &obj2 = obj->dictGetValNF(i);
203 n = obj->getRefNum();
204 h->hash((
char *)&n,
sizeof(
int));
205 n = obj->getRefGen();
206 h->hash((
char *)&n,
sizeof(
int));
216 if (!font->getName())
219 std::string tagname = font->getName()->c_str();
221 for (i = 0; i < tagname.size(); ++i) {
222 if (tagname[i] <
'A' || tagname[i] >
'Z') {
226 if (i != 6 || tagname.size() <= 7 || tagname[6] !=
'+')
228 return tagname.substr(7);
241 style = font->isItalic() ?
"italic" :
"";
245 switch (font->getWeight()) {
246 case GfxFont::WeightNotDefined:
255 weight = std::to_string(font->getWeight() * 100);
261 switch (font->getStretch()) {
262 case GfxFont::UltraCondensed:
265 case GfxFont::ExtraCondensed:
268 case GfxFont::Condensed:
271 case GfxFont::SemiCondensed:
274 case GfxFont::Normal:
277 case GfxFont::SemiExpanded:
280 case GfxFont::Expanded:
283 case GfxFont::ExtraExpanded:
286 case GfxFont::UltraExpanded:
297 if (!desc && font->getFamily()) {
300 std::string pdf_family =
validateString(font->getFamily()->c_str());
301 std::string desc_str = pdf_family;
302 auto pos =
name.find(
"-");
303 if (pos != std::string::npos) {
305 std::stringstream ret;
306 auto str =
name.substr(pos + 1,
name.size());
308 if (l >=
'A' && l <=
'Z')
312 desc_str = desc_str + ret.str();
314 desc = pango_font_description_from_string(desc_str.c_str());
317 desc = pango_font_description_from_string(pdf_family.c_str());
323 auto new_family = pango_font_description_get_family(desc);
328 switch (pango_font_description_get_style(desc)) {
329 case PANGO_STYLE_ITALIC:
332 case PANGO_STYLE_OBLIQUE:
338 auto pw = pango_font_description_get_weight(desc);
339 if (pw != PANGO_WEIGHT_NORMAL) {
340 weight = std::to_string(pw);
344 switch (pango_font_description_get_stretch(desc)) {
345 case PANGO_STRETCH_ULTRA_CONDENSED:
348 case PANGO_STRETCH_EXTRA_CONDENSED:
351 case PANGO_STRETCH_CONDENSED:
354 case PANGO_STRETCH_SEMI_CONDENSED:
357 case PANGO_STRETCH_SEMI_EXPANDED:
360 case PANGO_STRETCH_EXPANDED:
363 case PANGO_STRETCH_EXTRA_EXPANDED:
366 case PANGO_STRETCH_ULTRA_EXPANDED:
381 std::string source =
name;
382 transform(source.begin(), source.end(), source.begin(), ::tolower);
383 source.erase(std::remove_if(source.begin(), source.end(), ::isspace), source.end());
384 auto contains = [=](
const std::string &other) {
return source.find(other) != std::string::npos; };
386 if (contains(
"italic") || contains(
"slanted")) {
388 }
else if (contains(
"oblique")) {
393 static std::map<std::string, std::string> weights{
396 {
"ultrabold",
"800"},
397 {
"extrabold",
"800"},
401 {
"ultralight",
"200"},
402 {
"extralight",
"200"},
408 {
"regular",
"normal"},
410 {
"normal",
"normal"},
414 for (
auto w : weights) {
415 if (contains(
w.first))
419 static std::map<std::string, std::string> stretches{
421 {
"ultracondensed",
"ultra-condensed"},
422 {
"extracondensed",
"extra-condensed"},
423 {
"semicondensed",
"semi-condensed"},
424 {
"condensed",
"condensed"},
425 {
"ultraexpanded",
"ultra-expanded"},
426 {
"extraexpanded",
"extra-expanded"},
427 {
"semiexpanded",
"semi-expanded"},
428 {
"expanded",
"expanded"},
432 for (
auto s : stretches) {
433 if (contains(s.first))
449 auto new_family = pango_font_description_get_family(desc);
464 char *copyAsString = pango_font_description_to_string(desc);
465 std::string pangoString = copyAsString;
466 g_free(copyAsString);
483 std::set<int> &visitedObjects,
int page)
486 auto xref = pdf_doc->getXRef();
489 const Object &obj1 = resources->lookupNF(
"Font");
491 Object obj2 = obj1.fetch(xref);
493 auto r = obj1.getRef();
494 fontDict =
new InkFontDict(xref, &r, obj2.getDict());
496 }
else if (obj1.isDict()) {
497 fontDict =
new InkFontDict(xref,
nullptr, obj1.getDict());
501 for (
int i = 0; i < fontDict->
getNumFonts(); ++i) {
502 auto font = fontDict->
getFont(i);
503 if (fontsList->find(font) == fontsList->end()) {
505 fontsList->emplace(font,
FontData(font));
507 fontsList->at(font).pages.insert(
page);
512 const char *resTypes[] = {
"XObject",
"Pattern"};
513 for (
const char *resType : resTypes) {
514 Object objDict = resources->lookup(resType);
515 if (!objDict.isDict())
518 for (
int i = 0; i < objDict.dictGetLength(); ++i) {
520 const Object obj2 = objDict.getDict()->getVal(i, &obj2Ref);
521 if (obj2Ref != Ref::INVALID() && !visitedObjects.insert(obj2Ref.num).second)
524 if (!obj2.isStream())
528 const Object resObj = obj2.streamGetDict()->lookup(
"Resources", &resourcesRef);
529 if (resourcesRef != Ref::INVALID() && !visitedObjects.insert(resourcesRef.num).second)
532 if (resObj.isDict() && resObj.getDict() != resources) {
541 auto fontsList = std::make_shared<std::map<FontPtr, FontData>>();
542 auto count = pdf_doc->getCatalog()->getNumPages();
543 std::set<int> visitedObjects;
545 for (
auto page_num = 1; page_num <= count; page_num++) {
546 auto page = pdf_doc->getCatalog()->getPage(page_num);
547 auto resources =
page->getResourceDict();
581 if (isalpha(in[0]) && std::find_if_not(in.begin(), in.end(), isalnum) == in.end()) {
591 std::ostringstream outStream;
592 for (
char chr : in) {
597 outStream << std::hex << ((
unsigned int)chr & 0xff);
600 return outStream.str();
610 if (g_utf8_validate(in.c_str(), -1,
nullptr)) {
613 g_warning(
"Couldn't parse strings in the PDF, there may be errors.");
622 Object obj = dict->lookup(
key);
624 if (!obj.isString()) {
630std::string
getString(
const std::unique_ptr<GooString> &value)
645 if (_POPPLER_HAS_UNICODE_BOM(value)) {
646 str = g_convert(value->getCString () + 2, value->getLength () - 2,
647 "UTF-8",
"UTF-16BE", NULL, NULL, NULL);
648 }
else if (_POPPLER_HAS_UNICODE_BOMLE(value)) {
649 str = g_convert(value->getCString () + 2, value->getLength () - 2,
650 "UTF-8",
"UTF-16LE", NULL, NULL, NULL);
652#if POPPLER_CHECK_VERSION(25,02,0)
653 else if (
auto utf16 = pdfDocEncodingToUTF16(value->toStr()); !utf16.empty()) {
654 str = g_convert(utf16.c_str(), utf16.length(),
"UTF-8",
"UTF-16", NULL, NULL, NULL);
657 else if (
auto utf16 = pdfDocEncodingToUTF16(value->toStr(), &stringLength)) {
658 str = g_convert(utf16, stringLength,
"UTF-8",
"UTF-16", NULL, NULL, NULL);
663 std::string copy = str;
667 g_warning(
"Couldn't parse text in PDF from UTF16.");
675 std::cout <<
"[ ... ]";
679 for (
int i = 0; i < array->getLength(); ++i) {
680 for (
int x = depth; x > -1; x--)
682 std::cout << i <<
": ";
683 Object obj = array->get(i);
687 for (
int x = depth; x > 0; x--)
695 std::cout <<
"{ ... }";
699 for (
auto j = 0; j < dict->getLength(); j++) {
700 auto key = dict->getKey(j);
701 auto val = dict->getVal(j);
702 for (
int x = depth; x > -1; x--)
704 std::cout <<
key <<
": ";
708 for (
int x = depth; x > 0; x--)
716 std::cout <<
" > REF(" << obj->getRef().num <<
"):";
718 auto ref = obj->fetch(xref);
721 }
else if (obj->isDict()) {
723 }
else if (obj->isArray()) {
725 }
else if (obj->isString()) {
726 std::cout <<
" STR '" << obj->getString()->getCString() <<
"'";
727 }
else if (obj->isName()) {
728 std::cout <<
" NAME '" << obj->getName() <<
"'";
729 }
else if (obj->isBool()) {
730 std::cout <<
" BOOL " << (obj->getBool() ?
"true" :
"false");
731 }
else if (obj->isNum()) {
732 std::cout <<
" NUM " << obj->getNum();
734 std::cout <<
" > ? " << obj->getType() <<
"";
_PangoFontDescription PangoFontDescription
3x3 affine transformation matrix.
FontData(FontPtr font)
Extract all the useful information from the GfxFont object.
std::string getSpecification() const
std::string getSubstitute() const
PangoFontDescription * parsePostscriptName(std::string const &name, bool substitute)
Use font config to parse the postscript name found in pdf/ps files and return font config family and ...
3x3 matrix representing an affine transformation.
FontPtr lookup(const char *tag) const
FontPtr getFont(int i) const
void hashFontObject1(const Object *obj, FNVHash *h)
std::vector< FontPtr > fonts
int hashFontObject(Object *obj)
InkFontDict(XRef *xref, Ref *fontDictRef, Dict *fontDict)
static FontFactory & get(Args &&... args)
TODO: insert short description here.
Affine identity()
Create an identity matrix.
static cairo_user_data_key_t key
Geom::Affine stateToAffine(GfxState *state)
Get the default transformation state from the GfxState.
void ctmout(const char *label, const double *ctm)
FontList getPdfFonts(std::shared_ptr< PDFDoc > pdf_doc)
std::string getString(const std::unique_ptr< GooString > &value)
void pdf_debug_array(const Array *array, int depth, XRef *xref)
std::string getNameWithoutSubsetTag(FontPtr font)
std::string sanitizeId(std::string const &in)
Convert an arbitrary string into a sanitized ID string.
void pdf_debug_dict(const Dict *dict, int depth, XRef *xref)
std::string validateString(std::string const &in)
Ensure string is valid UTF8.
void _getFontsRecursive(std::shared_ptr< PDFDoc > pdf_doc, Dict *resources, const FontList &fontsList, std::set< int > &visitedObjects, int page)
std::string getDictString(Dict *dict, const char *key)
Get a string from a dictionary.
Geom::Affine ctmToAffine(const double *ctm)
Convert a transformation matrix to a lib2geom affine object.
void affout(const char *label, Geom::Affine ctm)
void pdf_debug_object(const Object *obj, int depth, XRef *xref)
PDF parsing utilities for libpoppler.
std::shared_ptr< std::map< FontPtr, FontData > > FontList
std::shared_ptr< GfxFont > FontPtr
std::string validateString(std::string const &in)
Ensure string is valid UTF8.