| // Copyright 2025 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "components/pdf/renderer/pdf_accessibility_tree_builder_heuristic.h" |
| |
| #include <algorithm> |
| #include <cmath> |
| #include <optional> |
| #include <string> |
| #include <vector> |
| |
| #include "base/check.h" |
| #include "base/check_op.h" |
| #include "base/containers/fixed_flat_map.h" |
| #include "base/feature_list.h" |
| #include "base/memory/raw_ref.h" |
| #include "components/pdf/renderer/pdf_accessibility_tree_builder.h" |
| #include "pdf/accessibility_structs.h" |
| #include "pdf/pdf_features.h" |
| #include "ui/accessibility/ax_enums.mojom-shared.h" |
| #include "ui/accessibility/ax_node_data.h" |
| #include "ui/gfx/geometry/point_f.h" |
| #include "ui/gfx/geometry/rect_f.h" |
| |
| namespace { |
| |
// --- Heading detection (font size) ---

// Heading detection by font size is only attempted when the page's median
// font size is larger than this many points.
constexpr float kMinimumFontSize = 5.0f;

// A text run whose font size exceeds the page median multiplied by this ratio
// is treated as a heading rather than normal text.
constexpr float kHeadingFontSizeRatio = 1.2f;

// --- Paragraph detection (line spacing) ---

// Paragraph-break detection by line spacing is only attempted when the page's
// median line spacing is larger than this many points.
constexpr float kMinimumLineSpacing = 5.0f;

// A gap between two lines that exceeds the page median line spacing multiplied
// by this ratio is treated as a paragraph break.
constexpr float kParagraphLineSpacingRatio = 1.2f;
| |
| // This class is used as part of our heuristic to determine which text runs live |
| // on the same "line". As we process runs, we keep a weighted average of the |
| // top and bottom coordinates of the line, and if a new run falls within that |
| // range (within a threshold) it is considered part of the line. |
| class LineHelper { |
| public: |
| explicit LineHelper( |
| const std::vector<chrome_pdf::AccessibilityTextRunInfo>& text_runs) |
| : text_runs_(text_runs) { |
| StartNewLine(0); |
| } |
| |
| LineHelper(const LineHelper&) = delete; |
| LineHelper& operator=(const LineHelper&) = delete; |
| |
| void StartNewLine(size_t current_index) { |
| DCHECK(current_index == 0 || current_index < text_runs_->size()); |
| start_index_ = current_index; |
| accumulated_weight_top_ = 0.0f; |
| accumulated_weight_bottom_ = 0.0f; |
| accumulated_width_ = 0.0f; |
| } |
| |
| void ProcessNextRun(size_t run_index) { |
| DCHECK_LT(run_index, text_runs_->size()); |
| RemoveOldRunsUpTo(run_index); |
| AddRun((*text_runs_)[run_index].bounds); |
| } |
| |
| bool IsRunOnSameLine(size_t run_index) const { |
| DCHECK_LT(run_index, text_runs_->size()); |
| |
| // Calculate new top/bottom bounds for our line. |
| if (accumulated_width_ == 0.0f) { |
| return false; |
| } |
| |
| float line_top = accumulated_weight_top_ / accumulated_width_; |
| float line_bottom = accumulated_weight_bottom_ / accumulated_width_; |
| |
| // Look at the next run, and determine how much it overlaps the line. |
| const auto& run_bounds = (*text_runs_)[run_index].bounds; |
| if (run_bounds.height() == 0.0f) { |
| return false; |
| } |
| |
| float clamped_top = std::max(line_top, run_bounds.y()); |
| float clamped_bottom = |
| std::min(line_bottom, run_bounds.y() + run_bounds.height()); |
| if (clamped_bottom < clamped_top) { |
| return false; |
| } |
| |
| float coverage = (clamped_bottom - clamped_top) / (run_bounds.height()); |
| |
| // See if it falls within the line (within our threshold). |
| constexpr float kLineCoverageThreshold = 0.25f; |
| return coverage > kLineCoverageThreshold; |
| } |
| |
| private: |
| void AddRun(const gfx::RectF& run_bounds) { |
| float run_width = fabsf(run_bounds.width()); |
| accumulated_width_ += run_width; |
| accumulated_weight_top_ += run_bounds.y() * run_width; |
| accumulated_weight_bottom_ += |
| (run_bounds.y() + run_bounds.height()) * run_width; |
| } |
| |
| void RemoveRun(const gfx::RectF& run_bounds) { |
| float run_width = fabsf(run_bounds.width()); |
| accumulated_width_ -= run_width; |
| accumulated_weight_top_ -= run_bounds.y() * run_width; |
| accumulated_weight_bottom_ -= |
| (run_bounds.y() + run_bounds.height()) * run_width; |
| } |
| |
| void RemoveOldRunsUpTo(size_t stop_index) { |
| // Remove older runs from the weighted average if we've exceeded the |
| // threshold distance from them. We remove them to prevent e.g. drop-caps |
| // from unduly influencing future lines. |
| constexpr float kBoxRemoveWidthThreshold = 3.0f; |
| while (start_index_ < stop_index && |
| accumulated_width_ > (*text_runs_)[start_index_].bounds.width() * |
| kBoxRemoveWidthThreshold) { |
| const auto& old_bounds = (*text_runs_)[start_index_].bounds; |
| RemoveRun(old_bounds); |
| start_index_++; |
| } |
| } |
| |
| const raw_ref<const std::vector<chrome_pdf::AccessibilityTextRunInfo>> |
| text_runs_; |
| size_t start_index_; |
| float accumulated_weight_top_; |
| float accumulated_weight_bottom_; |
| float accumulated_width_; |
| }; |
| |
| // Please keep the below map as close as possible to the list defined in the PDF |
| // Specification, ISO 32000-1:2008, table 333. |
| ax::mojom::Role StructureElementTypeToAccessibilityRole( |
| const std::string& element_type) { |
| static constexpr auto kStructureElementTypeToAccessibilityRoleMap = |
| base::MakeFixedFlatMap<std::string_view, ax::mojom::Role>( |
| {{"Document", ax::mojom::Role::kDocument}, |
| {"Part", ax::mojom::Role::kDocPart}, |
| {"Art", ax::mojom::Role::kArticle}, |
| {"Sect", ax::mojom::Role::kSection}, |
| {"Div", ax::mojom::Role::kGenericContainer}, |
| {"BlockQuote", ax::mojom::Role::kBlockquote}, |
| {"Caption", ax::mojom::Role::kCaption}, |
| {"TOC", ax::mojom::Role::kDocToc}, |
| {"TOCI", ax::mojom::Role::kListItem}, |
| {"Index", ax::mojom::Role::kDocIndex}, |
| {"P", ax::mojom::Role::kParagraph}, |
| {"H", ax::mojom::Role::kHeading}, |
| {"H1", ax::mojom::Role::kHeading}, |
| {"H2", ax::mojom::Role::kHeading}, |
| {"H3", ax::mojom::Role::kHeading}, |
| {"H4", ax::mojom::Role::kHeading}, |
| {"H5", ax::mojom::Role::kHeading}, |
| {"H6", ax::mojom::Role::kHeading}, |
| {"L", ax::mojom::Role::kList}, |
| {"LI", ax::mojom::Role::kListItem}, |
| {"Lbl", ax::mojom::Role::kListMarker}, |
| {"LBody", ax::mojom::Role::kNone}, // Presentational. |
| {"Table", ax::mojom::Role::kTable}, |
| {"TR", ax::mojom::Role::kRow}, |
| {"TH", ax::mojom::Role::kRowHeader}, |
| {"THead", ax::mojom::Role::kRowGroup}, |
| {"TBody", ax::mojom::Role::kRowGroup}, |
| {"TFoot", ax::mojom::Role::kRowGroup}, |
| {"TD", ax::mojom::Role::kCell}, |
| {"Span", ax::mojom::Role::kStaticText}, |
| {"Link", ax::mojom::Role::kLink}, |
| {"Figure", ax::mojom::Role::kFigure}, |
| {"Formula", ax::mojom::Role::kMath}, |
| {"Form", ax::mojom::Role::kForm}}); |
| |
| if (auto iter = |
| kStructureElementTypeToAccessibilityRoleMap.find(element_type); |
| iter != kStructureElementTypeToAccessibilityRoleMap.end()) { |
| return iter->second; |
| } |
| // Return something that could at least make some sense, other than |
| // `kUnknown`. |
| return ax::mojom::Role::kParagraph; |
| } |
| |
// Returns the heading level encoded in a PDF structure element type name:
// "H" and "H1" yield 1, and "H2" through "H6" yield 2 through 6. Every other
// name (including the empty string) yields std::nullopt.
//
// The level is read straight from the name rather than first looking the name
// up in the role map and then walking a chain of string comparisons. The names
// accepted here are exactly the ones that
// StructureElementTypeToAccessibilityRole() maps to `Role::kHeading`; keep the
// two in sync if heading types are ever added or removed.
std::optional<uint32_t> StructureElementTypeToHeadingLevel(
    const std::string& element_type) {
  // An unnumbered heading is treated as a level-1 heading.
  if (element_type == "H") {
    return 1;
  }
  // "H<digit>" with a single digit in [1, 6].
  if (element_type.size() == 2 && element_type[0] == 'H' &&
      element_type[1] >= '1' && element_type[1] <= '6') {
    return static_cast<uint32_t>(element_type[1] - '0');
  }
  return std::nullopt;
}
| |
// Returns true when `object_index` is a valid index into `objects` and that
// object's `text_run_index` is at or before `text_run_index`. `T` must expose
// a `text_run_index` member.
template <typename T>
bool IsObjectInTextRun(const std::vector<T>& objects,
                       uint32_t object_index,
                       size_t text_run_index) {
  if (object_index >= objects.size()) {
    return false;
  }
  return objects[object_index].text_run_index <= text_run_index;
}
| |
// Returns true when `object_index` is a valid index into `objects` and that
// object's text range starts at or before `text_run_index`. `T` must expose a
// `text_range.index` member.
template <typename T>
bool IsObjectWithRangeInTextRun(const std::vector<T>& objects,
                                uint32_t object_index,
                                size_t text_run_index) {
  if (object_index >= objects.size()) {
    return false;
  }
  return objects[object_index].text_range.index <= text_run_index;
}
| |
// Returns the larger of `object_end_text_run_index` and the index just before
// `current_text_run_index` (or 0 when the current index is already 0).
size_t NormalizeTextRunIndex(uint32_t object_end_text_run_index,
                             size_t current_text_run_index) {
  const size_t previous_run_index =
      current_text_run_index == 0 ? 0 : current_text_run_index - 1;
  const size_t object_end = object_end_text_run_index;
  return object_end > previous_run_index ? object_end : previous_run_index;
}
| |
| void ComputeParagraphAndHeadingThresholds( |
| const std::vector<chrome_pdf::AccessibilityTextRunInfo>& text_runs, |
| float* out_heading_font_size_threshold, |
| float* out_paragraph_spacing_threshold) { |
| // Scan over the font sizes and line spacing within this page and |
| // set heuristic thresholds so that text larger than the median font |
| // size can be marked as a heading, and spacing larger than the median |
| // line spacing can be a paragraph break. |
| std::vector<float> font_sizes; |
| std::vector<float> line_spacings; |
| for (size_t i = 0; i < text_runs.size(); ++i) { |
| font_sizes.push_back(text_runs[i].style.font_size); |
| if (i > 0) { |
| const auto& cur = text_runs[i].bounds; |
| const auto& prev = text_runs[i - 1].bounds; |
| if (cur.y() > prev.y() + prev.height() / 2) { |
| line_spacings.push_back(cur.y() - prev.y()); |
| } |
| } |
| } |
| if (font_sizes.size() > 2) { |
| std::sort(font_sizes.begin(), font_sizes.end()); |
| float median_font_size = font_sizes[font_sizes.size() / 2]; |
| if (median_font_size > kMinimumFontSize) { |
| *out_heading_font_size_threshold = |
| median_font_size * kHeadingFontSizeRatio; |
| } |
| } |
| if (line_spacings.size() > 4) { |
| std::sort(line_spacings.begin(), line_spacings.end()); |
| float median_line_spacing = line_spacings[line_spacings.size() / 2]; |
| if (median_line_spacing > kMinimumLineSpacing) { |
| *out_paragraph_spacing_threshold = |
| median_line_spacing * kParagraphLineSpacingRatio; |
| } |
| } |
| } |
| |
| bool BreakParagraph( |
| const std::vector<chrome_pdf::AccessibilityTextRunInfo>& text_runs, |
| uint32_t text_run_index, |
| float paragraph_spacing_threshold) { |
| // Check to see if its also a new paragraph, i.e., if the distance between |
| // lines is greater than the threshold. If there's no threshold, that |
| // means there weren't enough lines to compute an accurate median, so |
| // we compare against the line size instead. |
| float line_spacing = fabsf(text_runs[text_run_index + 1].bounds.y() - |
| text_runs[text_run_index].bounds.y()); |
| return ((paragraph_spacing_threshold > 0 && |
| line_spacing > paragraph_spacing_threshold) || |
| (paragraph_spacing_threshold == 0 && |
| line_spacing > kParagraphLineSpacingRatio * |
| text_runs[text_run_index].bounds.height())); |
| } |
| |
| void BuildStaticNode(ui::AXNodeData** static_text_node, |
| std::string* static_text) { |
| // If we're in the middle of building a static text node, finish it before |
| // moving on to the next object. |
| if (*static_text_node) { |
| (*static_text_node) |
| ->AddStringAttribute(ax::mojom::StringAttribute::kName, (*static_text)); |
| static_text->clear(); |
| } |
| *static_text_node = nullptr; |
| } |
| |
| void ConnectPreviousAndNextOnLine(ui::AXNodeData* previous_on_line_node, |
| ui::AXNodeData* next_on_line_node) { |
| previous_on_line_node->AddIntAttribute(ax::mojom::IntAttribute::kNextOnLineId, |
| next_on_line_node->id); |
| next_on_line_node->AddIntAttribute(ax::mojom::IntAttribute::kPreviousOnLineId, |
| previous_on_line_node->id); |
| } |
| |
| } // namespace |
| |
| namespace pdf { |
| |
| PdfAccessibilityTreeBuilderHeuristic::PdfAccessibilityTreeBuilderHeuristic( |
| PdfAccessibilityTreeBuilder& builder) |
| : builder_(builder) {} |
| |
| void PdfAccessibilityTreeBuilderHeuristic::BuildPageTree() { |
| ComputeParagraphAndHeadingThresholds(builder_->text_runs(), |
| &heading_font_size_threshold_, |
| ¶graph_spacing_threshold_); |
| |
| ui::AXNodeData* block_node = nullptr; |
| ui::AXNodeData* static_text_node = nullptr; |
| ui::AXNodeData* previous_on_line_node = nullptr; |
| std::string static_text; |
| LineHelper line_helper(builder_->text_runs()); |
| bool pdf_forms_enabled = |
| base::FeatureList::IsEnabled(chrome_pdf::features::kAccessiblePDFForm); |
| #if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE) |
| bool ocr_block = false; |
| bool has_ocr_text = false; |
| #endif |
| |
| for (size_t text_run_index = 0; text_run_index < builder_->text_runs().size(); |
| ++text_run_index) { |
| const chrome_pdf::AccessibilityTextRunInfo& text_run = |
| (builder_->text_runs())[text_run_index]; |
| |
| #if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE) |
| // OCR text should be marked by nodes before and after it. |
| bool ocr_block_start = text_run.is_searchified && !ocr_block; |
| bool ocr_block_end = !text_run.is_searchified && ocr_block; |
| if (ocr_block_start || ocr_block_end) { |
| // If already inside a block, end it. |
| // PDF searchifier only processes pages that have no text, hence OCR text |
| // is never added in the middle of a paragraph. |
| if (block_node) { |
| BuildStaticNode(&static_text_node, &static_text); |
| block_node = nullptr; |
| } |
| CHECK(ocr_block_start || text_run_index); |
| gfx::PointF position = ocr_block_start |
| ? text_run.bounds.origin() |
| : (builder_->text_runs())[text_run_index - 1] |
| .bounds.bottom_right(); |
| builder_->page_node()->child_ids.push_back( |
| builder_->CreateOcrWrapperNode(position, ocr_block_start)->id); |
| ocr_block = ocr_block_start; |
| has_ocr_text = true; |
| } |
| #endif // BUILDFLAG(ENABLE_SCREEN_AI_SERVICE) |
| // If we don't have a block level node, create one. |
| if (!block_node) { |
| block_node = |
| CreateBlockLevelNode(text_run.tag_type, text_run.style.font_size); |
| builder_->page_node()->child_ids.push_back(block_node->id); |
| } |
| |
| // If the `text_run_index` is less than or equal to the link's |
| // `text_run_index`, then push the link node in the block. |
| if (IsObjectWithRangeInTextRun(builder_->links(), current_link_index_, |
| text_run_index)) { |
| BuildStaticNode(&static_text_node, &static_text); |
| const chrome_pdf::AccessibilityLinkInfo& link = |
| (builder_->links())[current_link_index_++]; |
| AddLinkToParaNode(link, block_node, &previous_on_line_node, |
| &text_run_index); |
| |
| if (link.text_range.count == 0) { |
| continue; |
| } |
| |
| } else if (IsObjectInTextRun(builder_->images(), current_image_index_, |
| text_run_index)) { |
| BuildStaticNode(&static_text_node, &static_text); |
| AddImageToParaNode((builder_->images())[current_image_index_++], |
| block_node, &text_run_index); |
| continue; |
| } else if (IsObjectWithRangeInTextRun(builder_->highlights(), |
| current_highlight_index_, |
| text_run_index)) { |
| BuildStaticNode(&static_text_node, &static_text); |
| AddHighlightToParaNode( |
| (builder_->highlights())[current_highlight_index_++], block_node, |
| &previous_on_line_node, &text_run_index); |
| } else if (IsObjectInTextRun(builder_->text_fields(), |
| current_text_field_index_, text_run_index) && |
| pdf_forms_enabled) { |
| BuildStaticNode(&static_text_node, &static_text); |
| AddTextFieldToParaNode( |
| (builder_->text_fields())[current_text_field_index_++], block_node, |
| &text_run_index); |
| continue; |
| } else if (IsObjectInTextRun(builder_->buttons(), current_button_index_, |
| text_run_index) && |
| pdf_forms_enabled) { |
| BuildStaticNode(&static_text_node, &static_text); |
| AddButtonToParaNode((builder_->buttons())[current_button_index_++], |
| block_node, &text_run_index); |
| continue; |
| } else if (IsObjectInTextRun(builder_->choice_fields(), |
| current_choice_field_index_, text_run_index) && |
| pdf_forms_enabled) { |
| BuildStaticNode(&static_text_node, &static_text); |
| AddChoiceFieldToParaNode( |
| (builder_->choice_fields())[current_choice_field_index_++], |
| block_node, &text_run_index); |
| continue; |
| } else { |
| chrome_pdf::PageCharacterIndex page_char_index = { |
| builder_->page_index(), |
| builder_->text_run_start_indices()[text_run_index]}; |
| |
| // This node is for the text inside the block, it includes the text of all |
| // of the text runs. |
| if (!static_text_node) { |
| static_text_node = builder_->CreateStaticTextNode(page_char_index); |
| block_node->child_ids.push_back(static_text_node->id); |
| } |
| |
| // Add this text run to the current static text node. |
| ui::AXNodeData* inline_text_box_node = |
| builder_->CreateInlineTextBoxNode(text_run, page_char_index); |
| static_text_node->child_ids.push_back(inline_text_box_node->id); |
| |
| static_text += inline_text_box_node->GetStringAttribute( |
| ax::mojom::StringAttribute::kName); |
| |
| block_node->relative_bounds.bounds.Union( |
| inline_text_box_node->relative_bounds.bounds); |
| static_text_node->relative_bounds.bounds.Union( |
| inline_text_box_node->relative_bounds.bounds); |
| |
| if (previous_on_line_node) { |
| ConnectPreviousAndNextOnLine(previous_on_line_node, |
| inline_text_box_node); |
| } else { |
| line_helper.StartNewLine(text_run_index); |
| } |
| line_helper.ProcessNextRun(text_run_index); |
| |
| if (text_run_index < builder_->text_runs().size() - 1) { |
| if (line_helper.IsRunOnSameLine(text_run_index + 1)) { |
| // The next run is on the same line. |
| previous_on_line_node = inline_text_box_node; |
| } else { |
| // The next run is on a new line. |
| previous_on_line_node = nullptr; |
| } |
| } |
| } |
| |
| if (text_run_index == builder_->text_runs().size() - 1) { |
| BuildStaticNode(&static_text_node, &static_text); |
| break; |
| } |
| |
| if (!previous_on_line_node) { |
| if (BreakParagraph(builder_->text_runs(), text_run_index, |
| paragraph_spacing_threshold_)) { |
| BuildStaticNode(&static_text_node, &static_text); |
| block_node = nullptr; |
| } |
| } |
| } |
| |
| #if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE) |
| // Add the wrapper node if still in OCR block and text runs finish. |
| if (ocr_block) { |
| builder_->page_node()->child_ids.push_back( |
| builder_ |
| ->CreateOcrWrapperNode( |
| builder_->text_runs().back().bounds.bottom_right(), |
| /*start=*/false) |
| ->id); |
| } |
| |
| AddRemainingAnnotations(block_node, has_ocr_text); |
| #else |
| AddRemainingAnnotations(block_node); |
| #endif |
| } |
| |
| ui::AXNodeData* PdfAccessibilityTreeBuilderHeuristic::CreateBlockLevelNode( |
| const std::string& text_run_type, |
| float font_size) { |
| ui::AXNodeData* block_node = builder_->CreateAndAppendNode( |
| StructureElementTypeToAccessibilityRole(text_run_type), |
| ax::mojom::Restriction::kReadOnly); |
| block_node->AddBoolAttribute(ax::mojom::BoolAttribute::kIsLineBreakingObject, |
| true); |
| if (std::optional<uint32_t> level = |
| StructureElementTypeToHeadingLevel(text_run_type); |
| level) { |
| block_node->AddIntAttribute(ax::mojom::IntAttribute::kHierarchicalLevel, |
| *level); |
| // TODO(crbug.com/40707542): Set the HTML tag to "h*" by creating a helper |
| // in `AXEnumUtils`. |
| } |
| |
| if (builder_->mark_headings_using_heuristic() && |
| heading_font_size_threshold_ > 0 && |
| font_size > heading_font_size_threshold_) { |
| block_node->role = ax::mojom::Role::kHeading; |
| block_node->AddIntAttribute(ax::mojom::IntAttribute::kHierarchicalLevel, 2); |
| block_node->AddStringAttribute(ax::mojom::StringAttribute::kHtmlTag, "h2"); |
| } |
| |
| return block_node; |
| } |
| |
| void PdfAccessibilityTreeBuilderHeuristic::AddTextToAXNode( |
| size_t start_text_run_index, |
| uint32_t end_text_run_index, |
| ui::AXNodeData* ax_node, |
| ui::AXNodeData** previous_on_line_node) { |
| chrome_pdf::PageCharacterIndex page_char_index = { |
| builder_->page_index(), |
| builder_->text_run_start_indices()[start_text_run_index]}; |
| ui::AXNodeData* ax_static_text_node = |
| builder_->CreateStaticTextNode(page_char_index); |
| ax_node->child_ids.push_back(ax_static_text_node->id); |
| // Accumulate the text of the node. |
| std::string ax_name; |
| LineHelper line_helper(builder_->text_runs()); |
| |
| for (size_t text_run_index = start_text_run_index; |
| text_run_index <= end_text_run_index; ++text_run_index) { |
| const chrome_pdf::AccessibilityTextRunInfo& text_run = |
| (builder_->text_runs())[text_run_index]; |
| page_char_index.char_index = |
| builder_->text_run_start_indices()[text_run_index]; |
| // Add this text run to the current static text node. |
| ui::AXNodeData* inline_text_box_node = |
| builder_->CreateInlineTextBoxNode(text_run, page_char_index); |
| ax_static_text_node->child_ids.push_back(inline_text_box_node->id); |
| |
| ax_static_text_node->relative_bounds.bounds.Union( |
| inline_text_box_node->relative_bounds.bounds); |
| ax_name += inline_text_box_node->GetStringAttribute( |
| ax::mojom::StringAttribute::kName); |
| |
| if (*previous_on_line_node) { |
| ConnectPreviousAndNextOnLine(*previous_on_line_node, |
| inline_text_box_node); |
| } else { |
| line_helper.StartNewLine(text_run_index); |
| } |
| line_helper.ProcessNextRun(text_run_index); |
| |
| if (text_run_index < builder_->text_runs().size() - 1) { |
| if (line_helper.IsRunOnSameLine(text_run_index + 1)) { |
| // The next run is on the same line. |
| *previous_on_line_node = inline_text_box_node; |
| } else { |
| // The next run is on a new line. |
| *previous_on_line_node = nullptr; |
| } |
| } |
| } |
| |
| ax_node->AddStringAttribute(ax::mojom::StringAttribute::kName, ax_name); |
| ax_static_text_node->AddStringAttribute(ax::mojom::StringAttribute::kName, |
| ax_name); |
| } |
| |
| void PdfAccessibilityTreeBuilderHeuristic::AddTextToObjectNode( |
| size_t object_text_run_index, |
| uint32_t object_text_run_count, |
| ui::AXNodeData* object_node, |
| ui::AXNodeData* para_node, |
| ui::AXNodeData** previous_on_line_node, |
| size_t* text_run_index) { |
| // Annotation objects can overlap in PDF. There can be two overlapping |
| // scenarios: Partial overlap and Complete overlap. |
| // Partial overlap |
| // |
| // Link A starts Link B starts Link A ends Link B ends |
| // |a1 |b1 |a2 |b2 |
| // ----------------------------------------------------------------------- |
| // Text |
| // |
| // Complete overlap |
| // Link A starts Link B starts Link B ends Link A ends |
| // |a1 |b1 |b2 |a2 |
| // ----------------------------------------------------------------------- |
| // Text |
| // |
| // For overlapping annotations, both annotations would store the full |
| // text data and nothing will get truncated. For partial overlap, link `A` |
| // would contain text between a1 and a2 while link `B` would contain text |
| // between b1 and b2. For complete overlap as well, link `A` would contain |
| // text between a1 and a2 and link `B` would contain text between b1 and |
| // b2. The links would appear in the tree in the order of which they are |
| // present. In the tree for both overlapping scenarios, link `A` would |
| // appear first in the tree and link `B` after it. |
| |
| // If `object_text_run_count` > 0, then the object is part of the page text. |
| // Make the text runs contained by the object children of the object node. |
| size_t end_text_run_index = object_text_run_index + object_text_run_count; |
| uint32_t object_end_text_run_index = |
| std::min(end_text_run_index, builder_->text_runs().size()) - 1; |
| AddTextToAXNode(object_text_run_index, object_end_text_run_index, object_node, |
| previous_on_line_node); |
| |
| para_node->relative_bounds.bounds.Union(object_node->relative_bounds.bounds); |
| |
| *text_run_index = |
| NormalizeTextRunIndex(object_end_text_run_index, *text_run_index); |
| } |
| |
| void PdfAccessibilityTreeBuilderHeuristic::AddLinkToParaNode( |
| const chrome_pdf::AccessibilityLinkInfo& link, |
| ui::AXNodeData* para_node, |
| ui::AXNodeData** previous_on_line_node, |
| size_t* text_run_index) { |
| ui::AXNodeData* link_node = builder_->CreateLinkNode(link); |
| para_node->child_ids.push_back(link_node->id); |
| |
| // If `link.text_range.count` == 0, then the link is not part of the page |
| // text. Push it ahead of the current text run. |
| if (link.text_range.count == 0) { |
| --(*text_run_index); |
| return; |
| } |
| |
| // Make the text runs contained by the link children of |
| // the link node. |
| AddTextToObjectNode(link.text_range.index, link.text_range.count, link_node, |
| para_node, previous_on_line_node, text_run_index); |
| } |
| |
| void PdfAccessibilityTreeBuilderHeuristic::AddImageToParaNode( |
| const chrome_pdf::AccessibilityImageInfo& image, |
| ui::AXNodeData* para_node, |
| size_t* text_run_index) { |
| // If the `text_run_index` is less than or equal to the image's text run |
| // index, then push the image ahead of the current text run. |
| ui::AXNodeData* image_node = builder_->CreateImageNode(image); |
| para_node->child_ids.push_back(image_node->id); |
| --(*text_run_index); |
| } |
| |
| void PdfAccessibilityTreeBuilderHeuristic::AddHighlightToParaNode( |
| const chrome_pdf::AccessibilityHighlightInfo& highlight, |
| ui::AXNodeData* para_node, |
| ui::AXNodeData** previous_on_line_node, |
| size_t* text_run_index) { |
| ui::AXNodeData* highlight_node = builder_->CreateHighlightNode(highlight); |
| para_node->child_ids.push_back(highlight_node->id); |
| |
| // Make the text runs contained by the highlight children of |
| // the highlight node. |
| AddTextToObjectNode(highlight.text_range.index, highlight.text_range.count, |
| highlight_node, para_node, previous_on_line_node, |
| text_run_index); |
| |
| if (!highlight.note_text.empty()) { |
| ui::AXNodeData* popup_note_node = builder_->CreatePopupNoteNode(highlight); |
| highlight_node->child_ids.push_back(popup_note_node->id); |
| } |
| } |
| |
| void PdfAccessibilityTreeBuilderHeuristic::AddTextFieldToParaNode( |
| const chrome_pdf::AccessibilityTextFieldInfo& text_field, |
| ui::AXNodeData* para_node, |
| size_t* text_run_index) { |
| // If the `text_run_index` is less than or equal to the text_field's text |
| // run index, then push the text_field ahead of the current text run. |
| ui::AXNodeData* text_field_node = builder_->CreateTextFieldNode(text_field); |
| para_node->child_ids.push_back(text_field_node->id); |
| --(*text_run_index); |
| } |
| |
| void PdfAccessibilityTreeBuilderHeuristic::AddButtonToParaNode( |
| const chrome_pdf::AccessibilityButtonInfo& button, |
| ui::AXNodeData* para_node, |
| size_t* text_run_index) { |
| // If the `text_run_index` is less than or equal to the button's text |
| // run index, then push the button ahead of the current text run. |
| ui::AXNodeData* button_node = builder_->CreateButtonNode(button); |
| para_node->child_ids.push_back(button_node->id); |
| --(*text_run_index); |
| } |
| |
| void PdfAccessibilityTreeBuilderHeuristic::AddChoiceFieldToParaNode( |
| const chrome_pdf::AccessibilityChoiceFieldInfo& choice_field, |
| ui::AXNodeData* para_node, |
| size_t* text_run_index) { |
| // If the `text_run_index` is less than or equal to the choice_field's text |
| // run index, then push the choice_field ahead of the current text run. |
| ui::AXNodeData* choice_field_node = |
| builder_->CreateChoiceFieldNode(choice_field); |
| para_node->child_ids.push_back(choice_field_node->id); |
| --(*text_run_index); |
| } |
| |
| void PdfAccessibilityTreeBuilderHeuristic::AddRemainingAnnotations( |
| ui::AXNodeData* para_node |
| #if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE) |
| , |
| bool ocr_applied |
| #endif |
| ) { |
| // If we don't have additional links, images or form fields to insert in the |
| // tree, then return. |
| if (current_link_index_ >= builder_->links().size() && |
| current_image_index_ >= builder_->images().size() && |
| current_text_field_index_ >= builder_->text_fields().size() && |
| current_button_index_ >= builder_->buttons().size() && |
| current_choice_field_index_ >= builder_->choice_fields().size()) { |
| return; |
| } |
| |
| // If we don't have a paragraph node, create a new one. |
| if (!para_node) { |
| para_node = builder_->CreateAndAppendNode( |
| ax::mojom::Role::kParagraph, ax::mojom::Restriction::kReadOnly); |
| builder_->page_node()->child_ids.push_back(para_node->id); |
| } |
| // Push all the links not anchored to any text run to the last paragraph. |
| for (size_t i = current_link_index_; i < builder_->links().size(); i++) { |
| ui::AXNodeData* link_node = |
| builder_->CreateLinkNode((builder_->links())[i]); |
| para_node->child_ids.push_back(link_node->id); |
| } |
| |
| // Push all the images not anchored to any text run to the last paragraph |
| // unless OCR has run. PDF Searchify either OCRs all images on a page, or none |
| // of them. |
| bool push_remaining_images = true; |
| #if BUILDFLAG(ENABLE_SCREEN_AI_SERVICE) |
| push_remaining_images = !ocr_applied; |
| #endif |
| if (push_remaining_images) { |
| for (size_t i = current_image_index_; i < builder_->images().size(); i++) { |
| const chrome_pdf::AccessibilityImageInfo& image_info = |
| (builder_->images())[i]; |
| ui::AXNodeData* image_node = builder_->CreateImageNode(image_info); |
| para_node->child_ids.push_back(image_node->id); |
| } |
| } |
| |
| if (base::FeatureList::IsEnabled(chrome_pdf::features::kAccessiblePDFForm)) { |
| // Push all the text fields not anchored to any text run to the last |
| // paragraph. |
| for (size_t i = current_text_field_index_; |
| i < builder_->text_fields().size(); i++) { |
| ui::AXNodeData* text_field_node = |
| builder_->CreateTextFieldNode((builder_->text_fields())[i]); |
| para_node->child_ids.push_back(text_field_node->id); |
| } |
| |
| // Push all the buttons not anchored to any text run to the last |
| // paragraph. |
| for (size_t i = current_button_index_; i < builder_->buttons().size(); |
| i++) { |
| ui::AXNodeData* button_node = |
| builder_->CreateButtonNode((builder_->buttons())[i]); |
| para_node->child_ids.push_back(button_node->id); |
| } |
| |
| // Push all the choice fields not anchored to any text run to the last |
| // paragraph. |
| for (size_t i = current_choice_field_index_; |
| i < builder_->choice_fields().size(); i++) { |
| ui::AXNodeData* choice_field_node = |
| builder_->CreateChoiceFieldNode((builder_->choice_fields())[i]); |
| para_node->child_ids.push_back(choice_field_node->id); |
| } |
| } |
| } |
| |
| } // namespace pdf |