% Shankar et al., "Who Validates the Validators?" (UIST '24 proceedings paper).
% In the title, the acronym LLM and the capital following "?" are brace-protected
% so that sentence-casing bibliography styles do not lowercase them.
@inproceedings{10.1145/3654777.3676450,
  author    = {Shankar, Shreya and Zamfirescu-Pereira, J. D. and Hartmann, Bjoern and Parameswaran, Aditya and Arawjo, Ian},
  title     = {Who Validates the Validators? {Aligning} {LLM}-Assisted Evaluation of {LLM} Outputs with Human Preferences},
  year      = {2024},
  isbn      = {9798400706288},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3654777.3676450},
  doi       = {10.1145/3654777.3676450},
  abstract  = {Due to the cumbersome nature of human evaluation and limitations of code-based evaluation, Large Language Models (LLMs) are increasingly being used to assist humans in evaluating LLM outputs. Yet LLM-generated evaluators simply inherit all the problems of the LLMs they evaluate, requiring further human validation. We present a mixed-initiative approach to ``validate the validators''---aligning LLM-generated evaluation functions (be it prompts or code) with human requirements. Our interface, EvalGen, provides automated assistance to users in generating evaluation criteria and implementing assertions. While generating candidate implementations (Python functions, LLM grader prompts), EvalGen asks humans to grade a subset of LLM outputs; this feedback is used to select implementations that better align with user grades. A qualitative study finds overall support for EvalGen but underscores the subjectivity and iterative nature of alignment. In particular, we identify a phenomenon we dub criteria drift: users need criteria to grade outputs, but grading outputs helps users define criteria. What is more, some criteria appear dependent on the specific LLM outputs observed (rather than independent and definable a priori), raising serious questions for approaches that assume the independence of evaluation from observation of model outputs. We present our interface and implementation details, a comparison of our algorithm with a baseline approach, and implications for the design of future LLM evaluation assistants.},
  booktitle = {Proceedings of the 37th Annual ACM Symposium on User Interface Software and Technology},
  articleno = {131},
  numpages  = {14},
  keywords  = {active learning, auditing, evaluation, interfaces, language models, prompt engineering},
  location  = {Pittsburgh, PA, USA},
  series    = {UIST '24},
}