OS used for this experiment: Ubuntu 16.04.2 LTS
# Install Oracle Java 8 from the WebUpd8 PPA, then Apache Ant (used to build Nutch).
# NOTE(review): the webupd8team/java PPA has reportedly been discontinued; on a
# current system the distribution's OpenJDK 8 package is the likely substitute — confirm.
add-apt-repository ppa:webupd8team/java
apt-get update
apt-get install oracle-java8-installer
apt-get install ant

# Raise resource limits for this shell and the processes started from it:
#   -f  maximum size of files written
#   -t  maximum CPU time (seconds)
#   -v  maximum virtual memory
#   -n  maximum number of open file descriptors
#   -m  maximum resident set size
#   -u  maximum number of user processes
ulimit -f unlimited \
-t unlimited \
-v unlimited \
-n 64000 \
-m unlimited \
-u 64000

# Fetch the Nutch sources and build the runtime.
git clone https://github.com/apache/nutch.git
# The clone lands in ./nutch; ant must run from that directory, where build.xml lives.
cd nutch
ant clean runtime
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Site-level property overrides: crawler identity, plugin selection,
     default parser encoding, and the elastic.rest.* indexer settings. -->
<configuration>

  <!-- Agent name the crawler identifies itself with. -->
  <property>
    <name>http.agent.name</name>
    <value>Spiderman</value>
  </property>

  <!-- Alternation pattern naming the plugins to include.
       NOTE(review): the parse-(...) and index-(...) groups each appear twice
       with overlapping alternatives; this looks redundant. Confirm before
       consolidating. -->
  <property>
    <name>plugin.includes</name>
    <value>protocol-(http|httpclient)|urlfilter-regex|index-(basic|more)|query-(basic|site|url|lang)|indexer-elastic|indexer-elastic-rest|nutch-extensionpoints|parse-(text|html|msexcel|msword|mspowerpoint|pdf)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)|parse-(html|tika|metatags)|index-(basic|anchor|more|metadata)</value>
  </property>

  <!-- Default character encoding for the parser. -->
  <property>
    <name>parser.character.encoding.default</name>
    <value>utf-8</value>
  </property>

  <!-- elastic.rest.* settings. The host below is an example.org placeholder;
       replace it with the real endpoint. -->
  <property>
    <name>elastic.rest.host</name>
    <value>aws-elastic-search-endpoint.example.org</value>
  </property>
  <property>
    <name>elastic.rest.port</name>
    <value>443</value>
  </property>
  <property>
    <name>elastic.rest.index</name>
    <value>nutch</value>
  </property>
  <property>
    <name>elastic.rest.type</name>
    <value>doc</value>
  </property>

  <!-- HTTPS is switched on, matching port 443 above. -->
  <property>
    <name>elastic.rest.https</name>
    <value>true</value>
  </property>
  <!-- NOTE(review): presumably false keeps hostname verification enabled;
       confirm against the indexer plugin's documentation. -->
  <property>
    <name>elastic.rest.trustallhostnames</name>
    <value>false</value>
  </property>

</configuration>
No comments:
Post a Comment