Skip to content

Commit 14bc7fb

Browse files
author
vidy
committed
Rename tree to node
1 parent 5182bab commit 14bc7fb

8 files changed

Lines changed: 321 additions & 48 deletions

File tree

src/classify.rs

Lines changed: 35 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@ use pdf_render::TextSpan;
55

66
use crate::util::is_number;
77

8-
use super::util::Tri;
9-
108
#[derive(Copy, Clone, Debug, PartialEq)]
119
pub enum Class {
1210
Number,
@@ -15,33 +13,6 @@ pub enum Class {
1513
Mixed,
1614
}
1715

18-
#[derive(Debug)]
19-
pub struct TriCount {
20-
tru: usize,
21-
fal: usize,
22-
}
23-
impl TriCount {
24-
fn new() -> Self {
25-
TriCount {
26-
tru: 0,
27-
fal: 0
28-
}
29-
}
30-
fn add(&mut self, b: bool) {
31-
match b {
32-
false => self.fal += 1,
33-
true => self.tru += 1,
34-
}
35-
}
36-
fn count(&self) -> Tri {
37-
match (self.fal, self.tru) {
38-
(0, 0) => Tri::Unknown,
39-
(0, _) => Tri::True,
40-
(_, 0) => Tri::False,
41-
(f, t) => Tri::Maybe(t as f32 / (t + f) as f32)
42-
}
43-
}
44-
}
4516
pub fn classify<'a, E: Encoder + 'a>(spans: impl Iterator<Item=&'a TextSpan<E>>) -> Class {
4617
use pdf_render::FontEntry;
4718

@@ -72,4 +43,39 @@ pub fn classify<'a, E: Encoder + 'a>(spans: impl Iterator<Item=&'a TextSpan<E>>)
7243
(_, Tri::Maybe(_), _) => Class::Paragraph,
7344
_ => Class::Mixed
7445
}
46+
}
47+
48+
/// Four-valued outcome of tallying boolean observations.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Tri {
    /// Every recorded observation was `false`.
    False,
    /// Every recorded observation was `true`.
    True,
    /// Both kinds were observed; carries the fraction of `true` observations.
    Maybe(f32),
    /// Nothing was recorded.
    Unknown,
}

/// Running tally of boolean observations, split into `true` and `false` counts.
#[derive(Debug, Default)]
pub struct TriCount {
    /// Number of `true` observations recorded so far.
    tru: usize,
    /// Number of `false` observations recorded so far.
    fal: usize,
}
impl TriCount {
    /// Creates a tally with both counters at zero.
    fn new() -> Self {
        Self::default()
    }

    /// Records one observation.
    fn add(&mut self, b: bool) {
        match b {
            false => self.fal += 1,
            true => self.tru += 1,
        }
    }

    /// Summarises the tally as a [`Tri`].
    ///
    /// Returns `Unknown` when nothing was recorded, `True` / `False` when only
    /// one kind was seen, and `Maybe(true_count / total)` otherwise.
    fn count(&self) -> Tri {
        match (self.fal, self.tru) {
            (0, 0) => Tri::Unknown,
            (0, _) => Tri::True,
            (_, 0) => Tri::False,
            (f, t) => Tri::Maybe(t as f32 / (t + f) as f32),
        }
    }
}

src/flow.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use crate::classify::{classify, Class};
2-
use crate::tree::{Node, NodeTag};
2+
use crate::node::{Node, NodeTag};
33
use crate::util::{avg, CellContent, Rect};
44
use crate::text::concat_text;
55
use std::iter::once;

src/lib.rs

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use pathfinder_geometry::transform2d::Transform2F;
55
use pdf::{backend::Backend, object::{Page, Resolve}, PdfError};
66
use pdf_render::{tracer::{TraceCache, Tracer, DrawItem}, Fill, render_pattern, render_page, FillMode, font::OutlineBuilder};
77

8-
mod tree;
8+
mod node;
99
mod util;
1010
mod text;
1111
mod classify;
@@ -88,16 +88,8 @@ pub fn run<B: Backend>(file: &pdf::file::CachedFile<B>, page: &Page, resolve: &i
8888
for item in items {
8989
visit_item(item);
9090
}
91-
92-
spans.sort_unstable_by(|a, b| a.rect.min_y().partial_cmp(&b.rect.min_y()).unwrap());
9391

94-
spans.sort_unstable_by(|a, b| a.rect.min_x().partial_cmp(&b.rect.min_x()).unwrap());
95-
96-
for s in spans.iter().map(|s|s.text.as_str()) {
97-
println!(":{}", s)
98-
}
99-
100-
let root = tree::build(&spans, bbox, &lines);
92+
let root = node::build(&spans, bbox, &lines);
10193

10294
let mut flow = Flow::new();
10395
flow::build(&mut flow, &spans, &root, bbox.min_x());
File renamed without changes.

src/node/gap.rs

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
use ordered_float::NotNan;
2+
use pathfinder_geometry::rect::RectF;
3+
4+
/// Lists the gaps between consecutive boxes along one axis.
///
/// `span` maps a rectangle to its `(min, max)` extent on the axis of interest.
/// The running maximum of all extents seen so far is tracked; whenever a box
/// starts strictly beyond that maximum, `(previous_max, box_min, index)` is
/// yielded, where `index` is the position of that box in `boxes`.
///
/// An empty `boxes` slice yields no gaps.
// NOTE(review): presumably `boxes` is expected to be sorted along the axis by
// the caller — confirm.
pub fn gap_list<'a>(boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator<Item=(f32, f32, usize)> + 'a {
    let mut rest = boxes.iter();
    // Seed the running maximum from the first box. For an empty slice `rest`
    // is already exhausted, so the placeholder value is never read.
    let mut last_max = rest.next().map_or(f32::NEG_INFINITY, |(r, _)| span(r).1);
    rest.enumerate().filter_map(move |(idx, (r, _))| {
        let (min, max) = span(r);
        let gap = if min > last_max {
            // `idx` counts from the second box, hence the `+ 1`.
            Some((last_max, min, idx + 1))
        } else {
            None
        };
        last_max = max.max(last_max);
        gap
    })
}
20+
21+
/// Yields the midpoint of every gap between boxes that is at least
/// `threshold` wide.
///
/// `span` maps a rectangle to its `(min, max)` extent on the axis of interest.
/// A gap is measured from the running maximum of all previous extents to the
/// minimum of the current box.
///
/// An empty `boxes` slice yields nothing.
pub fn gaps<'a>(threshold: f32, boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator<Item=f32> + 'a {
    let mut rest = boxes.iter();
    // Seed the running maximum from the first box. For an empty slice `rest`
    // is already exhausted, so the placeholder value is never read.
    let mut last_max = rest.next().map_or(f32::NEG_INFINITY, |(r, _)| span(r).1);
    rest.filter_map(move |(r, _)| {
        let (min, max) = span(r);
        let midpoint = if min - last_max >= threshold {
            Some(0.5 * (last_max + min))
        } else {
            None
        };
        last_max = max.max(last_max);
        midpoint
    })
}
36+
37+
pub fn max_gap(boxes: &[(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32)) -> Option<(f32, f32)> {
38+
gap_list(boxes, span)
39+
.max_by_key(|&(a, b, _)| NotNan::new(b - a).unwrap())
40+
.map(|(a, b, _)| (b - a, 0.5 * (a + b)))
41+
}
42+
43+
pub fn dist_x(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> {
44+
max_gap(boxes, |r| (r.min_x(), r.max_x()))
45+
}
46+
pub fn dist_y(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> {
47+
max_gap(boxes, |r| (r.min_y(), r.max_y()))
48+
}
49+
50+
/// Looks for a vertical gap near the top and near the bottom of `bbox`.
///
/// Returns `(top, bottom)`, each the box index reported by `gap_list` for the
/// matching gap:
/// * If the first gap starts above 20% of the bbox height it is the top gap;
///   the last of the *remaining* gaps is the bottom gap if it starts below 80%.
/// * Otherwise, if the first gap itself starts below 80%, it is the bottom gap.
///
/// Fewer than two boxes always gives `(None, None)`. `boxes` is not modified.
pub fn top_bottom_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) {
    if boxes.len() < 2 {
        return (None, None);
    }

    let top_limit = bbox.min_y() + bbox.height() * 0.2;
    let bottom_limit = bbox.min_y() + bbox.height() * 0.8;

    // Extent of each box along the y axis: (top edge, bottom edge).
    let mut gaps = gap_list(boxes, |r| (r.min_y(), r.max_y()));

    let (first_start, first_idx) = match gaps.next() {
        Some((y, _, idx)) => (y, idx),
        None => return (None, None),
    };

    if first_start < top_limit {
        let bottom = gaps
            .last()
            .filter(|&(y, _, _)| y > bottom_limit)
            .map(|(_, _, idx)| idx);
        (Some(first_idx), bottom)
    } else if first_start > bottom_limit {
        (None, Some(first_idx))
    } else {
        (None, None)
    }
}
76+
77+
/// Looks for a horizontal gap near the left and near the right edge of `bbox`.
///
/// Returns `(left, right)`, each the box index reported by `gap_list` for the
/// matching gap:
/// * If the first gap starts left of 20% of the bbox width it is the left gap;
///   the last of the *remaining* gaps is the right gap if it starts beyond 80%.
/// * Otherwise, if the first gap itself starts beyond 80%, it is the right gap.
///
/// Fewer than two boxes always gives `(None, None)`. `boxes` is not modified.
pub fn left_right_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) {
    if boxes.len() < 2 {
        return (None, None);
    }

    let left_limit = bbox.min_x() + bbox.width() * 0.2;
    let right_limit = bbox.min_x() + bbox.width() * 0.8;

    let mut gaps = gap_list(boxes, |r| (r.min_x(), r.max_x()));

    let (first_start, first_idx) = match gaps.next() {
        Some((x, _, idx)) => (x, idx),
        None => return (None, None),
    };

    if first_start < left_limit {
        let right = gaps
            .last()
            .filter(|&(x, _, _)| x > right_limit)
            .map(|(_, _, idx)| idx);
        (Some(first_idx), right)
    } else if first_start > right_limit {
        (None, Some(first_idx))
    } else {
        (None, None)
    }
}

src/node/line.rs

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
2+
use std::collections::BTreeSet;
3+
use ordered_float::NotNan;
4+
use pathfinder_geometry::rect::RectF;
5+
6+
use crate::util::avg;
7+
8+
use super::{sort_x, sort_y, Node, NodeTag};
9+
10+
pub fn analyze_lines(lines: &[[f32; 4]]) -> Lines {
11+
let mut hlines = BTreeSet::new();
12+
let mut vlines = BTreeSet::new();
13+
14+
for &[x1, y1, x2, y2] in lines {
15+
if x1 == x2 {
16+
vlines.insert(NotNan::new(x1).unwrap());
17+
} else if y1 == y2 {
18+
hlines.insert(NotNan::new(y1).unwrap());
19+
}
20+
}
21+
22+
fn dedup(lines: impl Iterator<Item=NotNan<f32>>) -> Vec<(f32, f32)> {
23+
let threshold = 10.0;
24+
let mut out = vec![];
25+
let mut lines = lines.map(|f| *f).peekable();
26+
while let Some(start) = lines.next() {
27+
let mut last = start;
28+
while let Some(&p) = lines.peek() {
29+
if last + threshold > p {
30+
last = p;
31+
lines.next();
32+
} else {
33+
break;
34+
}
35+
}
36+
out.push((start, last));
37+
}
38+
out
39+
}
40+
41+
let hlines = dedup(hlines.iter().cloned());
42+
let vlines = dedup(vlines.iter().cloned());
43+
44+
let mut line_grid = vec![false; vlines.len() * hlines.len()];
45+
for &[x1, y1, x2, y2] in lines {
46+
if x1 == x2 {
47+
let v_idx = vlines.iter().position(|&(a, b)| a <= x1 && x1 <= b).unwrap_or(vlines.len());
48+
let h_start = hlines.iter().position(|&(a, b)| y1 >= a).unwrap_or(hlines.len());
49+
let h_end = hlines.iter().position(|&(a, b)| y2 <= b).unwrap_or(hlines.len());
50+
for h in h_start .. h_end {
51+
line_grid[v_idx * hlines.len() + h] = true;
52+
}
53+
} else if y1 == y2 {
54+
let h_idx = hlines.iter().position(|&(a, b)| a <= y1 && y1 <= b).unwrap_or(hlines.len());
55+
let v_start = vlines.iter().position(|&(a, b)| x1 >= a).unwrap_or(vlines.len());
56+
let v_end = vlines.iter().position(|&(a, b)| x2 <= b).unwrap_or(vlines.len());
57+
for v in v_start .. v_end {
58+
line_grid[v * hlines.len() + h_idx] = true;
59+
}
60+
}
61+
}
62+
63+
64+
//println!("hlines: {:?}", hlines);
65+
//println!("vlines: {:?}", vlines);
66+
67+
Lines { hlines, vlines, line_grid }
68+
}
69+
70+
/// Merged horizontal / vertical line positions plus a grid of marked cells.
#[derive(Debug, Clone, PartialEq)]
pub struct Lines {
    /// `(first, last)` y-coordinate ranges of merged horizontal lines.
    pub hlines: Vec<(f32, f32)>,
    /// `(first, last)` x-coordinate ranges of merged vertical lines.
    pub vlines: Vec<(f32, f32)>,
    /// Flattened `vlines.len() * hlines.len()` grid; the cell for vertical
    /// group `v` and horizontal group `h` is at index `v * hlines.len() + h`.
    pub line_grid: Vec<bool>,
}
75+
76+
pub fn overlapping_lines(boxes: &mut [(RectF, usize)]) -> Node {
77+
sort_y(boxes);
78+
let avg_height = avg(boxes.iter().map(|(r, _)| r.height())).unwrap();
79+
80+
let mut y_center = boxes[0].0.center().y();
81+
let mut lines = vec![];
82+
let mut y_splits = vec![];
83+
84+
let mut start = 0;
85+
'a: loop {
86+
for (i, &(r, _)) in boxes[start..].iter().enumerate() {
87+
if r.center().y() > 0.5 * avg_height + y_center {
88+
let end = start + i;
89+
sort_x(&mut boxes[start..end]);
90+
let bbox = boxes[start..end].iter().map(|&(r, _)| r).reduce(|a, b| a.union_rect(b)).unwrap();
91+
92+
y_splits.push(bbox.max_y());
93+
lines.push(Node::singleton(&boxes[start..end]));
94+
y_center = r.center().y();
95+
96+
start = end;
97+
continue 'a;
98+
}
99+
}
100+
101+
sort_x(&mut boxes[start..]);
102+
lines.push(Node::singleton(&boxes[start..]));
103+
104+
break;
105+
}
106+
match lines.len() {
107+
0 => Node::singleton(&[]),
108+
1 => lines.pop().unwrap(),
109+
_ => Node::Grid {
110+
x: vec![],
111+
y: y_splits,
112+
cells: lines,
113+
tag: NodeTag::Paragraph
114+
}
115+
}
116+
}

0 commit comments

Comments
 (0)