OS used for this experiment: Ubuntu 16.04.2 LTS
add-apt-repository ppa:webupd8team/java
apt-get update
apt-get install oracle-java8-installer
apt-get install ant
ulimit -f unlimited \
       -t unlimited \
       -v unlimited \
       -n 64000 \
       -m unlimited \
       -u 64000
git clone https://github.com/apache/nutch.git
cd nutch
ant clean runtime

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <!-- Site-level Nutch settings: crawler identity, the set of active
       plugins, and the Elasticsearch REST endpoint used for indexing. -->

  <!-- Name the crawler identifies itself with. -->
  <property>
    <name>http.agent.name</name>
    <value>Spiderman</value>
  </property>

  <!-- Regular expression of plugin ids to activate. Keep the value on a
       single line: whitespace inside it would become part of the pattern.
       The parse-(...) and index-(...) groups were each listed twice with
       overlapping alternatives; they are merged here and match the same
       set of plugin ids as before.
       NOTE(review): both indexer-elastic and indexer-elastic-rest are
       enabled, but only elastic.rest.* properties are set in this file;
       confirm that indexer-elastic is really wanted. -->
  <property>
    <name>plugin.includes</name>
    <value>protocol-(http|httpclient)|urlfilter-regex|query-(basic|site|url|lang)|indexer-elastic|indexer-elastic-rest|nutch-extensionpoints|parse-(text|html|msexcel|msword|mspowerpoint|pdf|tika|metatags)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)|index-(basic|anchor|more|metadata)</value>
  </property>

  <!-- Character encoding assumed when a document does not declare one
       (presumably; verify against the Nutch default descriptions). -->
  <property>
    <name>parser.character.encoding.default</name>
    <value>utf-8</value>
  </property>

  <!-- Elasticsearch REST indexer target. The host below is a placeholder
       endpoint; port 443 goes together with elastic.rest.https=true. -->
  <property>
    <name>elastic.rest.host</name>
    <value>aws-elastic-search-endpoint.example.org</value>
  </property>
  <property>
    <name>elastic.rest.port</name>
    <value>443</value>
  </property>
  <property>
    <name>elastic.rest.index</name>
    <value>nutch</value>
  </property>
  <property>
    <name>elastic.rest.type</name>
    <value>doc</value>
  </property>
  <property>
    <name>elastic.rest.https</name>
    <value>true</value>
  </property>
  <!-- false presumably keeps TLS hostname verification enabled; confirm
       against the indexer-elastic-rest plugin documentation. -->
  <property>
    <name>elastic.rest.trustallhostnames</name>
    <value>false</value>
  </property>
</configuration>
No comments:
Post a Comment