<?php

/*
 * Example: send a PDF to Solr as an extract request.
 *
 * Steps:
 *   1. Build the extraction parameters (document id, captured elements,
 *      field mappings, default field and xpath restriction).
 *   2. Read the binary document from disk.
 *   3. Wrap the content and parameters in a SolrExtractRequest and send it
 *      through $client->sendUpdateStream().
 *
 * NOTE(review): $client is not defined in this file; it is presumably a
 * configured SolrClient created by the including script - confirm.
 */

// please reference docs/documentation.php for the rest of the parameters
$extractParams = new SolrModifiableParams();
$extractParams
    // index the document, using the unique ID: doc1
    ->set(SolrExtractRequest::LITERALS_PREFIX . 'id', 'doc1')

    // capture what is inside paragraph tags
    ->set(SolrExtractRequest::CAPTURE_ELEMENTS, 'p')

    // index attributes of the Tika XHTML elements into separate fields
    ->set(SolrExtractRequest::CAPTURE_ATTRIBUTES, 'true')

    // map p content to solr field
    ->set(
        SolrExtractRequest::FIELD_MAPPING_PREFIX . 'p',
        'an_indexed_field_name_that_holds_paragraphs'
    )

    // capture unmapped content here
    ->set(SolrExtractRequest::DEFAULT_FIELD, '__text__')

    // restrict capturing to matching xpath expression
    ->set(
        SolrExtractRequest::XPATH_EXPRESSION,
        '/xhtml:html/xhtml:body/xhtml:div//node()'
    )
;

// file_get_contents() returns false on failure; without this guard the false
// would be coerced to an empty string and an empty document would be sent
// under the id "doc1".
$binContent = file_get_contents('somefile.pdf');
if ($binContent === false) {
    throw new RuntimeException('Unable to read somefile.pdf');
}

// createFromStream() receives the raw bytes, their MIME type and the parameters built above
$extractRequest = SolrExtractRequest::createFromStream($binContent, 'application/pdf', $extractParams);
$response = $client->sendUpdateStream($extractRequest);
35
36