Anyway, there are lots of similar HOWTOs out there, but this one points to specific versions and URLs, so you can simply copy and paste the commands below and have your system up and running in a few minutes.
# Refresh the package lists and upgrade the installed system.
apt-get update
apt-get dist-upgrade
reboot # only if necessary
# Add the webupd8team Java PPA, then install the Oracle Java 8 installer package.
add-apt-repository ppa:webupd8team/java
apt-get update
apt-get install oracle-java8-installer
# Install Ant (the Nutch build below is started with "ant runtime").
apt-get install ant
# Lift the file size limit for the current shell session.
# This must be its own command: a trailing backslash here would join it with
# the apt-key line below, so ulimit would fail and the key would not be imported.
ulimit -f unlimited
# Import the signing key for the MongoDB package repository added below.
apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 0C49F3730359A14518585931BC711F9BA15703C6
# Register the MongoDB 3.4 APT repository for Ubuntu xenial.
echo "deb [ arch=amd64,arm64 ] http://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/3.4 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.4.list
# Install MongoDB from that repository and start the service.
apt-get update
apt-get install mongodb-org
service mongod start
# Open the mongo shell and switch to the "nutch" database
# (the same name that gora.mongodb.db uses in the Gora settings of this guide).
echo "use nutch" | mongo
# Download the Apache Nutch 2.3.1 source tarball and unpack it in the current directory.
wget http://apache.cs.utah.edu/nutch/2.3.1/apache-nutch-2.3.1-src.tar.gz
tar xvf apache-nutch-2.3.1-src.tar.gz
<!-- Ivy dependency on the Gora MongoDB module, revision 0.6.1 (presumably goes into the Ivy file of the Nutch source tree; confirm the exact file). -->
<dependency org="org.apache.gora" name="gora-mongodb" rev="0.6.1" conf="*->default" />
# Gora settings: use the MongoDB store as the default datastore.
gora.datastore.default=org.apache.gora.mongodb.store.MongoStore
gora.mongodb.override_hadoop_configuration=false
# Mapping file for the MongoDB store.
gora.mongodb.mapping.file=/gora-mongodb-mapping.xml
# MongoDB server address and database name ("nutch" is the database selected
# with "use nutch" in the mongo shell earlier in this guide).
gora.mongodb.servers=localhost:27017
gora.mongodb.db=nutch
# Run the "runtime" Ant target.
ant runtime # must be run in the top-level directory (presumably of the unpacked Nutch source tree)
# Elasticsearch 1.7 is installed here. It's possible to use the latest version, ES5, but then you need to update the indexer.
# Import the Elastic package signing key.
wget -qO - https://packages.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add -
# Register the Elasticsearch 1.7 APT repository.
echo "deb http://packages.elastic.co/elasticsearch/1.7/debian stable main" | sudo tee -a /etc/apt/sources.list.d/elasticsearch-1.7.list
# Both halves of the chain go through sudo, so the install is not skipped
# when "apt-get update" is run from a non-root shell.
sudo apt-get update && sudo apt-get install elasticsearch
# Elasticsearch settings: bind to the loopback address only.
network.host: 127.0.0.1
# Cluster name; the Nutch configuration in this guide sets elastic.cluster to the same value.
cluster.name: nutch
node.name: nutch1
# Register the Elasticsearch init script with the default runlevels, then restart the service.
update-rc.d elasticsearch defaults
/etc/init.d/elasticsearch restart
# Register the Kibana 4.4 APT repository.
echo "deb http://packages.elastic.co/kibana/4.4/debian stable main" | sudo tee -a /etc/apt/sources.list.d/kibana-4.4.x.list
# Both halves of the chain go through sudo, so the install is not skipped
# when "apt-get update" is run from a non-root shell.
sudo apt-get update && sudo apt-get install kibana
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Nutch site configuration: MongoDB (via Gora) as the data store, Elasticsearch as the indexer. -->
<configuration>
  <!-- Storage backend: the Gora MongoDB store. -->
  <property>
    <name>storage.data.store.class</name>
    <value>org.apache.gora.mongodb.store.MongoStore</value>
    <description>Default class for storing data</description>
  </property>

  <!-- Agent name of the crawler. -->
  <property>
    <name>http.agent.name</name>
    <value>Spiderman</value>
  </property>

  <!--
    Regular expression selecting the plugins to load. The parse-(...) and
    index-(...) alternatives are each written as one group; every plugin id
    appears exactly once.
  -->
  <property>
    <name>plugin.includes</name>
    <value>protocol-(http|httpclient)|urlfilter-regex|query-(basic|site|url|lang)|indexer-elastic|nutch-extensionpoints|parse-(text|html|msexcel|msword|mspowerpoint|pdf|tika|metatags)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)|index-(basic|anchor|more|metadata)</value>
  </property>

  <!-- Elasticsearch connection: host and port. -->
  <property>
    <name>elastic.host</name>
    <value>127.0.0.1</value>
  </property>
  <property>
    <name>elastic.port</name>
    <value>9300</value>
  </property>

  <!-- Cluster name (same value as cluster.name in the Elasticsearch settings of this guide) and index name. -->
  <property>
    <name>elastic.cluster</name>
    <value>nutch</value>
  </property>
  <property>
    <name>elastic.index</name>
    <value>nutch</value>
  </property>

  <!-- Default character encoding for the parser. -->
  <property>
    <name>parser.character.encoding.default</name>
    <value>utf-8</value>
  </property>

  <!-- Content size limit for HTTP downloads (presumably in bytes; confirm against nutch-default.xml). -->
  <property>
    <name>http.content.limit</name>
    <value>6553600</value>
  </property>

  <!-- Bulk limits for the Elasticsearch indexer (document count and size; confirm units against nutch-default.xml). -->
  <property>
    <name>elastic.max.bulk.docs</name>
    <value>2000</value>
  </property>
  <property>
    <name>elastic.max.bulk.size</name>
    <value>2500500</value>
  </property>
</configuration>
No comments:
Post a Comment