/*
 * /usr/include/mupdf/fitz/structured-text.h, as shipped in libmupdf-dev 1.7a-1.
 * The installed file is owned by root:root, with mode 0o644.
 */
#ifndef MUPDF_FITZ_STRUCTURED_TEXT_H
#define MUPDF_FITZ_STRUCTURED_TEXT_H
#include "mupdf/fitz/system.h"
#include "mupdf/fitz/context.h"
#include "mupdf/fitz/math.h"
#include "mupdf/fitz/font.h"
#include "mupdf/fitz/colorspace.h"
#include "mupdf/fitz/image.h"
#include "mupdf/fitz/output.h"
#include "mupdf/fitz/device.h"
/*
Text extraction device: Used for searching, format conversion etc.
(In development - Subject to change in future versions)
*/
/*
	Forward declarations of the structured-text types, listed in
	containment order (page -> block -> line -> span -> char),
	followed by the style sheet types. The struct bodies are
	defined further down in this header.
*/
typedef struct fz_text_page_s fz_text_page;
typedef struct fz_page_block_s fz_page_block;
typedef struct fz_text_block_s fz_text_block;
typedef struct fz_image_block_s fz_image_block;
typedef struct fz_text_line_s fz_text_line;
typedef struct fz_text_span_s fz_text_span;
typedef struct fz_text_char_s fz_text_char;
typedef struct fz_text_sheet_s fz_text_sheet;
typedef struct fz_text_style_s fz_text_style;
/*
fz_text_sheet: A text sheet contains a list of distinct text styles
used on a page (or a series of pages).
*/
struct fz_text_sheet_s
{
/* NOTE(review): presumably the highest style id handed out so far - confirm */
int maxid;
/* The styles held by this sheet (NOTE(review): presumably the head of a
 * list chained through fz_text_style.next - confirm) */
fz_text_style *style;
};
/*
fz_text_style: A text style contains details of a distinct text style
used on a page.
*/
struct fz_text_style_s
{
fz_text_style *next; /* Next style (NOTE(review): presumably the next entry in the owning sheet's list - confirm) */
int id; /* Identifier of this style (NOTE(review): presumably unique within a sheet - confirm) */
fz_font *font; /* Font used by this style */
float size; /* Font size (NOTE(review): units are not stated in this header - confirm) */
int wmode; /* NOTE(review): presumably the writing mode, 0 horizontal / 1 vertical as for fz_text_span.wmode - confirm */
int script; /* NOTE(review): meaning not documented in this header - confirm against the implementation */
/* Ascender and Descender only have the conventional sense in
 * horizontal mode; in vertical mode they are rotated too - they are
 * the maximum and minimum bounds respectively. */
float ascender;
float descender;
/* etc... */
};
/*
fz_text_page: A text page is a list of page blocks, together with
an overall bounding box.
*/
struct fz_text_page_s
{
	/* Overall bounding box of the page. */
	fz_rect mediabox;

	/*
	 * The page blocks.
	 * NOTE(review): presumably len is the number of entries in use and
	 * cap the number of entries allocated in blocks - confirm.
	 */
	int len;
	int cap;
	fz_page_block *blocks;

	/* NOTE(review): presumably links to a following text page - confirm. */
	fz_text_page *next;
};
/*
fz_page_block: A page block is a typed block pointer.
*/
struct fz_page_block_s
{
/* Selects which member of u is meaningful (NOTE(review): presumably one
 * of the FZ_PAGE_BLOCK_* values - confirm) */
int type;
union
{
fz_text_block *text; /* The block, when it is a text block */
fz_image_block *image; /* The block, when it is an image block */
} u;
};
/* Page block kinds; one per member of the union in fz_page_block
 * (NOTE(review): presumably the values stored in fz_page_block.type - confirm) */
enum
{
FZ_PAGE_BLOCK_TEXT = 0, /* Text block (fz_text_block) */
FZ_PAGE_BLOCK_IMAGE = 1 /* Image block (fz_image_block) */
};
/*
fz_text_block: A text block is a list of lines of text. In typical
cases this may correspond to a paragraph or a column of text. A
collection of blocks makes up a page.
*/
struct fz_text_block_s
{
	/* Bounding box of the block. */
	fz_rect bbox;

	/*
	 * The lines of text in this block.
	 * NOTE(review): presumably len is the number of lines in use and
	 * cap the number of lines allocated - confirm.
	 */
	int len;
	int cap;
	fz_text_line *lines;
};
/*
fz_image_block: An image block is an image, together with its
bounding box, transformation matrix, colorspace and color values.
A collection of blocks makes up a page.
*/
struct fz_image_block_s
{
fz_rect bbox; /* Bounding box of the block (NOTE(review): coordinate space not stated here - confirm) */
fz_matrix mat; /* Matrix for the image (NOTE(review): presumably the transform it was placed with - confirm) */
fz_image *image; /* The image itself */
/* NOTE(review): cspace/colors are presumably the colorspace and color
 * values the image was drawn with - confirm against the implementation */
fz_colorspace *cspace;
float colors[FZ_MAX_COLORS];
};
/*
fz_text_line: A text line is a list of text spans, with the same
baseline. In typical cases this should correspond (as expected) to
complete lines of text. A collection of lines makes up a block.
*/
struct fz_text_line_s
{
	/* First and last spans of this line; spans chain via fz_text_span.next. */
	fz_text_span *first_span;
	fz_text_span *last_span;

	/* Cached information */
	float distance;	/* Perpendicular distance from previous line */
	fz_rect bbox;
	void *region;	/* Opaque value for matching line masks */
};
/*
fz_text_span: A text span is a list of characters that share a common
baseline/transformation. In typical cases a single span may be enough
to represent a complete line. In cases where the text has big gaps in
it (perhaps as it crosses columns or tables), a line may be represented
by multiple spans.
*/
struct fz_text_span_s
{
	/*
	 * The characters of this span.
	 * NOTE(review): presumably len is the number of chars in use and
	 * cap the number of chars allocated in text - confirm.
	 */
	int len;
	int cap;
	fz_text_char *text;

	/* Geometry */
	fz_point min;		/* Device space */
	fz_point max;		/* Device space */
	int wmode;		/* 0 for horizontal, 1 for vertical */
	fz_matrix transform;	/* e and f are always 0 here */

	/*
	 * Ascender_max and Descender_min only have the conventional sense
	 * in horizontal mode; in vertical mode they are rotated too - they
	 * are the maximum and minimum bounds respectively.
	 */
	float ascender_max;	/* Document space */
	float descender_min;	/* Document space */
	fz_rect bbox;		/* Device space */

	/* Cached information */
	float base_offset;	/* Perpendicular distance from baseline of line */
	float spacing;		/* Distance along baseline from previous span in this line (or 0 if first) */
	int column;		/* If non zero, the column that it's in */
	float column_width;	/* Percentage */
	int align;		/* 0 = left, 1 = centre, 2 = right */
	float indent;		/* The indent position for this column. */

	/* Next span (NOTE(review): presumably the next span in the same line - confirm). */
	fz_text_span *next;
};
/*
fz_text_char: A text char is a unicode character, the style in which
it appears, and the point at which it is positioned. Transform
(and hence bbox) information is given by the enclosing span.
*/
struct fz_text_char_s
{
fz_point p; /* Device space */
int c; /* The unicode character */
fz_text_style *style; /* The style in which the character appears */
};
/*
fz_char_and_box: A character code paired with a bounding box. This is
the type passed to and returned from fz_text_char_at.
*/
typedef struct fz_char_and_box_s fz_char_and_box;
struct fz_char_and_box_s
{
int c; /* Character code (NOTE(review): presumably unicode, as for fz_text_char.c - confirm) */
fz_rect bbox; /* Bounding box for the character */
};
/*
fz_text_char_at: NOTE(review): not documented in this header.
Presumably looks up the character at index idx within page and reports
its code and bbox through cab - confirm against the implementation.
cab: Caller-supplied fz_char_and_box.
page: The text page to look in.
idx: Index of the character (NOTE(review): presumably counted across
the whole page - confirm).
Returns a pointer to a fz_char_and_box (NOTE(review): presumably cab -
confirm).
*/
fz_char_and_box *fz_text_char_at(fz_context *ctx, fz_char_and_box *cab, fz_text_page *page, int idx);
/*
fz_text_char_bbox: Return the bbox of a text char. Calculated from
the supplied enclosing span.
bbox: A place to store the bbox
span: The enclosing span
idx: The index of the char within the span
(NOTE(review): presumably expected to lie in [0, span->len) - confirm)
Returns bbox (updated)
Does not throw exceptions
*/
fz_rect *fz_text_char_bbox(fz_context *ctx, fz_rect *bbox, fz_text_span *span, int idx);
/*
fz_new_text_sheet: Create an empty style sheet.
The style sheet is filled out by the text device, creating
one style for each unique font, color, size combination that
is used.
Returns the new text sheet (NOTE(review): presumably to be released
with fz_drop_text_sheet - confirm).
*/
fz_text_sheet *fz_new_text_sheet(fz_context *ctx);
/*
fz_drop_text_sheet: NOTE(review): not documented in this header.
Presumably releases a text sheet obtained from fz_new_text_sheet -
confirm against the implementation.
sheet: The text sheet to drop.
*/
void fz_drop_text_sheet(fz_context *ctx, fz_text_sheet *sheet);
/*
fz_new_text_page: Create an empty text page.
The text page is filled out by the text device to contain the blocks,
lines and spans of text on the page.
Returns the new text page (NOTE(review): presumably to be released
with fz_drop_text_page - confirm).
*/
fz_text_page *fz_new_text_page(fz_context *ctx);
/*
fz_drop_text_page: NOTE(review): not documented in this header.
Presumably releases a text page obtained from fz_new_text_page -
confirm against the implementation.
page: The text page to drop.
*/
void fz_drop_text_page(fz_context *ctx, fz_text_page *page);
/*
fz_analyze_text: NOTE(review): not documented in this header.
Presumably performs layout analysis on an extracted page, using the
styles in sheet - confirm against the implementation.
sheet: The text sheet associated with the page.
page: The text page to analyze.
*/
void fz_analyze_text(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page);
/*
fz_print_text_sheet: Output a text sheet to a file as CSS.
out: The output to write to.
sheet: The text sheet to output.
*/
void fz_print_text_sheet(fz_context *ctx, fz_output *out, fz_text_sheet *sheet);
/*
fz_print_text_page_html: Output a page to a file in HTML format.
out: The output to write to.
page: The text page to output.
*/
void fz_print_text_page_html(fz_context *ctx, fz_output *out, fz_text_page *page);
/*
fz_print_text_page_xml: Output a page to a file in XML format.
out: The output to write to.
page: The text page to output.
*/
void fz_print_text_page_xml(fz_context *ctx, fz_output *out, fz_text_page *page);
/*
fz_print_text_page: Output a page to a file in UTF-8 format.
out: The output to write to.
page: The text page to output.
*/
void fz_print_text_page(fz_context *ctx, fz_output *out, fz_text_page *page);
/*
fz_search_text_page: Search for occurrence of 'needle' in text page.
Return the number of hits and store hit bboxes in the passed in array.
text: The text page to search.
needle: The string to search for.
hit_bbox: The array in which hit bboxes are stored.
hit_max: NOTE(review): presumably the number of entries available in
hit_bbox - confirm.
NOTE: This is an experimental interface and subject to change without notice.
*/
int fz_search_text_page(fz_context *ctx, fz_text_page *text, const char *needle, fz_rect *hit_bbox, int hit_max);
/*
fz_highlight_selection: Return a list of rectangles to highlight given a selection rectangle.
page: The text page the selection applies to.
rect: The selection rectangle.
hit_bbox: NOTE(review): presumably the array in which the rectangles
are stored, as for fz_search_text_page - confirm.
hit_max: NOTE(review): presumably the number of entries available in
hit_bbox - confirm.
NOTE(review): the return type is int; presumably this is the number of
rectangles stored - confirm.
NOTE: This is an experimental interface and subject to change without notice.
*/
int fz_highlight_selection(fz_context *ctx, fz_text_page *page, fz_rect rect, fz_rect *hit_bbox, int hit_max);
/*
fz_copy_selection: Return a newly allocated UTF-8 string with the text for a given selection rectangle.
page: The text page to copy from.
rect: The selection rectangle.
NOTE(review): the string is newly allocated, so the caller presumably
owns it and must free it - confirm which deallocator applies.
NOTE: This is an experimental interface and subject to change without notice.
*/
char *fz_copy_selection(fz_context *ctx, fz_text_page *page, fz_rect rect);
/*
fz_new_text_device: Create a device to extract the text on a page.
Gather and sort the text on a page into spans of uniform style,
arranged into lines and blocks by reading order. The reading order
is determined by various heuristics, so may not be accurate.
sheet: The text sheet to which styles should be added. This can
either be a newly created (empty) text sheet, or one containing
styles from a previous text device. The same sheet cannot be used
in multiple threads simultaneously.
page: The text page to which content should be added. This will
usually be a newly created (empty) text page, but it can be one
containing data already (for example when merging multiple pages, or
watermarking).
Returns the newly created device.
*/
fz_device *fz_new_text_device(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page);
#endif /* MUPDF_FITZ_STRUCTURED_TEXT_H */