import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.sql.SparkSession
/**
 * Computes the TF-IDF cosine similarity between two text documents.
 *
 * Pipeline: read each file as ONE document -> Tokenizer -> HashingTF -> IDF,
 * then cosine similarity between the two resulting feature vectors.
 */
object DocumentSimilarity {

  /**
   * Cosine similarity between two equal-length dense vectors.
   *
   * @param a first vector as a dense array
   * @param b second vector as a dense array (must have the same length as `a`)
   * @return dot(a, b) / (||a|| * ||b||); 0.0 if either vector has zero norm,
   *         so a blank document never produces NaN
   */
  def cosineSimilarity(a: Array[Double], b: Array[Double]): Double = {
    require(a.length == b.length, s"vector lengths differ: ${a.length} vs ${b.length}")
    var dot = 0.0
    var sqA = 0.0
    var sqB = 0.0
    var i = 0
    while (i < a.length) { // single pass, no intermediate collections for 10k-dim vectors
      dot += a(i) * b(i)
      sqA += a(i) * a(i)
      sqB += b(i) * b(i)
      i += 1
    }
    val denom = math.sqrt(sqA) * math.sqrt(sqB)
    if (denom == 0.0) 0.0 else dot / denom
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("DocumentSimilarity")
      .master("local")
      .getOrCreate()
    // Required for .toDF(...) and the $"col" interpolator (missing in the original).
    import spark.implicits._

    val documentPaths = Seq(
      "D:\\下载\\A Tale of Two Cities - Charles Dickens.txt",
      "D:\\下载\\David Copperfield - Charles Dickens.txt"
    )

    // BUG FIX: the original zipWithIndex()'d the combined line RDD, assigning an
    // id per LINE — "documents" 0 and 1 were just the first two lines of the
    // first file. Read each file as one whole document with one id instead.
    val documents = documentPaths.zipWithIndex
      .map { case (path, id) =>
        val fullText = spark.read.textFile(path).collect().mkString(" ")
        (id.toLong, fullText)
      }
      .toDF("id", "text")

    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val wordsData = tokenizer.transform(documents)

    val hashingTF = new HashingTF()
      .setInputCol("words")
      .setOutputCol("rawFeatures")
      .setNumFeatures(10000)
    val featurizedData = hashingTF.transform(wordsData)

    val idf = new IDF()
      .setInputCol("rawFeatures")
      .setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)

    // Hoist the repeated getAs calls; ml.linalg.Vector exposes no public
    // instance norm(p), so similarity is computed on dense arrays instead.
    val vec1 = rescaledData.filter($"id" === 0).select("features").head()
      .getAs[org.apache.spark.ml.linalg.Vector](0)
    val vec2 = rescaledData.filter($"id" === 1).select("features").head()
      .getAs[org.apache.spark.ml.linalg.Vector](0)

    val similarity = cosineSimilarity(vec1.toArray, vec2.toArray)
    println(s"The similarity between document 1 and document 2 is: $similarity")

    spark.stop()
  }
}