代码之家  ›  专栏  ›  技术社区  ›  Essex

ElasticSearch:在我的django web应用程序中索引文档

  •  0
  • Essex  · 技术社区  · 6 年前

    我第一次在自己的 Web 应用程序中尝试使用 ElasticSearch,在 ES 索引(indexation)方面遇到了一些困难。

    1. 当我添加一个带有upload字段的文档时,该文档由ES索引
    2. 我有一个函数可以定义一个新的文档标题
    3. 我必须用新标题重新索引此文档,以便出现在ES文档列表中

    最后一部分我有些问题,我想知道你是否能帮我。

    实际进程:

    models.py 文件:

    class Document(EdqmFullTable):
        """Model excerpt for an uploaded document (other fields are elided here)."""

        CAT_CHOICES = (...)

        ...
        file = models.FileField(upload_to=upload_file)

        def get_filename(self):
            """Return the stored file name joined onto ``settings.MEDIA_ROOT``."""
            media_root = settings.MEDIA_ROOT
            relative_name = str(self.file)
            return os.path.join(media_root, relative_name)
    

    通过此模型添加新文档时,将调用ES方法:

    es4omcl.py 文件:

    class EdqmES(object):
        """Small wrapper around an Elasticsearch client used to index documents.

        The connection target defaults to ``localhost:9200`` and can be
        overridden per instance with the ``host`` and ``port`` keyword arguments.
        """

        host = 'localhost'
        port = 9200
        es = None

        def __init__(self, *args, **kwargs):
            """Store the connection target and create the client.

            :param host: server host; falls back to the class attribute
            :param port: server port; falls back to the class attribute
            """
            self.host = kwargs.pop('host', self.host)
            self.port = kwargs.pop('port', self.port)

            # Connect to ElasticSearch server
            self.es = Elasticsearch([{
                'host': self.host,
                'port': self.port
            }])

        def __str__(self):
            """Return the connection target as ``host:port``."""
            # ``port`` is an int by default, so ``host + ':' + port`` raised
            # TypeError; formatting accepts both int and str ports.
            return '{}:{}'.format(self.host, self.port)

        @staticmethod
        def file_encode(filename):
            """Return the content of ``filename`` as a base64-encoded text string.

            :param filename: path of the file to read (opened in binary mode)
            :raises IOError: when the file cannot be opened or read
            """
            with open(filename, "rb") as f:
                return b64encode(f.read()).decode('utf-8')

        def create_pipeline(self):
            """Send the ``attachment`` pipeline definition to the server.

            The pipeline runs the ``attachment`` processor on field ``data``,
            stores its output in ``attachment`` and then removes ``data``.
            """
            body = {
                "description": "Extract attachment information",
                "processors": [
                    {"attachment": {
                        "field": "data",
                        "target_field": "attachment",
                        # NOTE(review): -1 presumably disables the extraction
                        # size limit — confirm against the processor docs.
                        "indexed_chars": -1
                    }},
                    {"remove": {"field": "data"}}
                ]
            }
            # NOTE(review): this goes through the generic ``index`` call with
            # index '_ingest' / doc type 'pipeline'; presumably it resolves to
            # the ingest pipeline endpoint — confirm for the client version in use.
            self.es.index(
                index='_ingest',
                doc_type='pipeline',
                id='attachment',
                body=body
            )

        def index_document(self, doc, bulk=False):
            """Index one document, or only build its body when ``bulk`` is true.

            :param doc: object exposing ``get_filename()``, ``id``, ``file`` and ``title``
            :param bulk: when true, return the body dict without contacting the server
            :return: the body dict (``bulk=True``) or the result of ``es.index``
            """
            filename = doc.get_filename()

            try:
                data = self.file_encode(filename)
            except IOError:
                # Best effort: an unreadable file is still indexed, with empty data.
                data = ''
                print('ERROR with ' + filename)
                # TODO: log error

            # NOTE(review): '_id' is also sent inside the body on the non-bulk
            # path below; presumably only the bulk caller needs it — confirm.
            item_body = {
                '_id': doc.id,
                'data': data,
                'relative_path': str(doc.file),
                'title': doc.title,
            }

            if bulk:
                return item_body

            result1 = self.es.index(
                index='omcl', doc_type='annual-report',
                id=doc.id,
                pipeline='attachment',
                body=item_body,
                request_timeout=60
            )
            print(result1)
            return result1
    

    将新文档保存到数据库时,会触发 callback.py 文件中的信号处理函数:

    @receiver(signals.post_save, sender=Document, dispatch_uid='add_new_doc')
    def add_document_handler(sender, instance=None, created=False, **kwargs):
        """React to a newly created document; saves of existing rows are ignored.

        A declaration of conformity refreshes ``last_conformity`` on its OMCL,
        an annual report is sent to Elasticsearch, any other category does nothing.

        :param sender: Model class that emitted the signal
        :type sender: the model class
        :param instance: Object which was just saved
        :type instance: model instance
        :param created: True for a creation, False for an update
        :type created: boolean
        :param kwargs: Remaining signal arguments
        :type kwargs: dict
        """
        if not created:
            return

        # Declaration of conformity: stamp the owning OMCL with the current date
        if instance.category == Document.OPT_CD:
            stamp = datetime.today()
            Omcl.objects.filter(id=instance.omcl_id).update(last_conformity=stamp)
            return

        # Only annual reports are indexed
        if instance.category == Document.OPT_ANNUAL:
            indexer = EdqmES()
            indexer.index_document(instance)
    

    我的流程:

    我定义了一个新的类,允许在文档上传后立即处理它们。我可以修改文档标题。最后一步是:用ES重新索引这个修改过的文档。

    class ManageDocView(AdminRequiredMixin, View, BaseException):
        """Admin page that lists an OMCL's documents and renames one for a new year."""

        # NOTE(review): ``BaseException`` as a base looks unintentional for a view;
        # it is kept so the class hierarchy stays unchanged — confirm and drop.
        template_name = 'omcl/manage_doc_form.html'
        form_class = ManageDocForm
        success_url = 'omcl/manage_doc_form.html'

        def get(self, request):
            """Render the template with an unbound form."""
            form = self.form_class()
            context = {
                "form": form
            }
            return render(request, self.template_name, context)

        def post(self, request):
            """Handle the two submit actions of the page.

            ``SearchOMCL`` loads the selected OMCL and its documents.
            ``UpdateDocument`` builds a new title and file name from the posted
            year, renames the file under ``MEDIA_ROOT`` and saves the document.

            :param request: the incoming POST request
            :return: the rendered template, or a redirect to ``manage_doc`` when
                the file to rename is missing on disk
            """
            form = self.form_class()
            query_document_updated = None
            query_omcl = None
            query_document = None

            if "SearchOMCL" in request.POST:
                omcl_list = request.POST['omcl_list']
                query_omcl = Omcl.objects.get(id=omcl_list)
                query_document = Document.objects.filter(omcl=omcl_list)

            elif "UpdateDocument" in request.POST:
                checkbox_id = request.POST['DocumentChoice']
                checkbox_id_minus_1 = int(checkbox_id) - 1

                query_document_updated = Document.objects.get(id=checkbox_id)

                omclcode = query_document_updated.omcl.code
                src_filename = query_document_updated.src_filename
                file_extension = os.path.splitext(src_filename)[1]
                category = query_document_updated.category

                if category == "ANNUAL":
                    category = "ANNUAL_REPORT"

                year = request.POST.get('q1year')

                # Create the new document title updated by the new year
                new_document_title = f"{year}_{category}_{omclcode}_{checkbox_id_minus_1} - {src_filename}"

                # Create the new document file updated by the new year
                new_document_file = f"omcl_docs/{omclcode}/{year}_{category}_{omclcode}_{checkbox_id_minus_1}{file_extension}"

                # Get file.name in order to rename document file in /media/
                document_path = query_document_updated.file.name

                try:
                    # Both paths are built the same way; os.rename returns None,
                    # so its result is not kept.
                    os.rename(
                        os.path.join(settings.MEDIA_ROOT, document_path),
                        os.path.join(settings.MEDIA_ROOT, new_document_file),
                    )
                except FileNotFoundError:
                    # Interpolate after the gettext lookup: an f-string inside
                    # _() produces a msgid that never matches the catalogue.
                    messages.error(
                        request,
                        _("Document %(name)s doesn't exist in the server") % {'name': src_filename},
                    )
                    return redirect('manage_doc')
                else:
                    # Assign modifications to selected document and save it into the database
                    query_document_updated.title = new_document_title
                    query_document_updated.file = new_document_file
                    # NOTE(review): this save is an update; if indexing only
                    # happens on creation, the search index presumably keeps
                    # the old title and path — confirm and re-index if needed.
                    query_document_updated.save()
                    messages.success(request, _("The modification has been taken account"))

            context = {
                'form': form,
                'query_omcl': query_omcl,
                'query_document': query_document,
                'query_document_updated': query_document_updated,
            }
            return render(request, self.template_name, context)
    

    我完全卡住了,因为我不知道该如何修改 callback.py 文件中的这一部分:

    # Guard quoted from add_document_handler: any post_save that is not a
    # creation (created=False, i.e. an update) returns here, before indexing.
    if not created:
        return
    

    ManageDocView()

    0 回复  |  直到 6 年前