# Paste-site header residue (pastecode.io), not part of the config:
# "Untitled" · mail@pastecode.io · unknown · plain_text · a year ago · 2.4 kB · 2 · Indexable · Never
---
# Config schema version and job type. The launcher that reads this file is not
# visible here; presumably job_type selects a Spark 3.1.1 runtime (TODO confirm).
version: v2.1
job_type: spark3.1.1
# Target cluster identifier (presumably where the job is submitted — verify).
cluster: hdp-z502-prod

# Packed conda environment reference in `path:tag` form; presumably resolved
# and shipped to the cluster by the launcher (TODO confirm).
conda_pack: data_science/matcher/ml-products/prediction-pipeline:v7.0.6
# HDFS warehouse root for the matcher project.
hdfs_warehouse_path: hdfs:///warehouse/matcher/

# Spark-related settings. `files` and `context` are interpreted by the job
# launcher, which is not visible here; the notes below assume the `context`
# entries are passed through as Spark properties (TODO confirm).
spark_config:
  # Model artifacts on HDFS, keyed by model name. Only `electronics` is active;
  # its key also appears under extra_kwargs.models_config.
  # NOTE(review): the active `electronics` key points at an `electronics_v2_*`
  # file, while the disabled `electronics_v2` key points at an
  # `electronics_v3_*` file — confirm the key/file naming is intentional.
  files:
    electronics:
      path: hdfs:///warehouse/ml_products_team/ds/files/models/electronics_v2_20230417_ce.cbm
    # Disabled alternatives:
    # electronics_v2:
      # path: hdfs:///warehouse/ml_products_team/ds/files/models/electronics_v3_20230706_ce_pretrained.cbm
    # all_v3:
      # path: hdfs:///warehouse/ml_products_team/ds/files/models/all_v3_20230831.cbm

  context:
    # 8 executor cores / 2 CPUs per task => at most 4 concurrent tasks per executor.
    spark.task.cpus: 2
    spark.yarn.queue: ml_products
    spark.driver.cores: 4
    spark.driver.memory: 8g
    spark.executor.cores: 8
    spark.executor.memory: 8g
    spark.memory.fraction: 0.6
    # Fixed-size allocation: dynamic allocation is disabled below, so the job
    # asks for exactly 80 executors.
    spark.executor.instances: 80
    spark.driver.maxResultSize: 4g
    spark.driver.memoryOverhead: 2g
    # Non-heap overhead on top of the 8g executor heap (8g + 6g requested per
    # executor); presumably sized for Python workers / model inference — TODO confirm.
    spark.executor.memoryOverhead: 6g
    spark.shuffle.service.enabled: true
    spark.dynamicAllocation.enabled: false
    # -1 disables automatic broadcast joins.
    spark.sql.autoBroadcastJoinThreshold: -1
    # Use Apache Arrow for columnar data transfer in PySpark (pandas conversion).
    spark.sql.execution.arrow.pyspark.enabled: true
    # On overwrite, replace only the partitions that receive new data.
    spark.sql.sources.partitionOverwriteMode: dynamic


# Dotted path of the pipeline entry point; presumably imported and invoked by
# the launcher with the sections of this file (TODO confirm).
callable: matcher_prediction_pipeline.multi_inference.main

# Named Hive inputs. `prod_table` is referenced by name from
# extra_kwargs.data; how `candidates` is consumed is not visible in this file.
input_data:
  candidates:
    type: hive
    # NOTE(review): a train dataset is wired in as the candidates source while
    # the matcher candidates table is disabled — confirm this is intended.
    # Disabled alternative:
    # table: matcher.ozon_ozon_candidates_features
    table: ml_products.all_categories_v3_20230814_train_dataset_finally
  prod_table:
    type: hive
    table: matcher.ozon_data_etl_v2_prod
    # Disabled alternative:
    # table: ml_products.all_categories_v3_20230814_train_dataset_finally

# Named outputs; `prediction` targets a Hive table partitioned by `date`.
output_data:
  prediction:
    type: hive
    # NOTE(review): the destination looks like a personal schema
    # (mzakhvataev_db) while the cluster is named "prod" — confirm the target.
    table: mzakhvataev_db.ozon_ozon_predictions_electronics_v2
    insertInto:
      # With spark.sql.sources.partitionOverwriteMode set to `dynamic` in
      # spark_config, presumably only the written `date` partitions are
      # replaced — TODO confirm against the launcher's writer.
      overwrite: true
      # Presumably creates the table when it does not exist yet (verify).
      on_missing: create
    partitionBy: date

# Extra keyword arguments; presumably forwarded to the `callable` entry point
# (their handling is not visible in this file — TODO confirm).
extra_kwargs:
  tasks_per_worker: 12
  tmp_output_table: mzakhvataev_db.ozon_ozon_predictions_tmp_common
  prod_table_key: variantid
  # Model name -> list of ids, quoted so they stay strings (presumably category
  # ids, judging by the inline labels). The model names match the keys under
  # spark_config.files.
  models_config:
    electronics:
      - "15621042"  # Electronics
    # Disabled alternatives:
    # electronics_v2:
      # - "15621042"  # Electronics
    # all_v3:
      # - "15621042"  # Electronics

  # Additional columns, presumably given as [name, type] pairs.
  extra_columns:
    - [source, string]
  # Two lookups against the `prod_table` input, both keyed by `variantid`;
  # presumably they join the two sides of a candidate pair via the candidates
  # columns `variantid1` and `variantid2` (TODO confirm).
  data:
    first:
      key: variantid
      input_data: prod_table
      candidates_key: variantid1
    second:
      key: variantid
      input_data: prod_table
      candidates_key: variantid2

# Submission parameters. Every value is quoted, so each one loads as a string
# (including the 'false' flag and the '60' timeout, whose unit is not stated here).
# NOTE(review): the leading underscore and the fixed execution_date suggest
# this section was captured from a specific run — confirm before reuse.
_submit_params:
  explain: 'false'
  timeout: '60'
  execution_date: '2023-09-23T01:00:00+00:00'